mirror of
https://github.com/Zie619/n8n-workflows.git
synced 2025-11-25 11:29:34 +08:00
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)
147 lines
5.1 KiB
Python
147 lines
5.1 KiB
Python
import json
|
|
import os
|
|
from pathlib import Path
|
|
import glob
|
|
import re
|
|
|
|
def load_def_categories(def_categories_path="context/def_categories.json"):
    """Load the integration -> category mapping from def_categories.json.

    Args:
        def_categories_path: Path to a JSON file containing a list of
            ``{"integration": ..., "category": ...}`` objects. Defaults to
            the repository's context/def_categories.json.

    Returns:
        dict: Maps a normalized integration name (lowercased, with all
        non-alphanumeric characters stripped) to its category string.
    """
    def_categories_path = Path(def_categories_path)
    with open(def_categories_path, 'r', encoding='utf-8') as f:
        raw_map = json.load(f)

    # Normalize keys: strip non-alphanumerics and lowercase, so filename
    # tokens can match definitions regardless of punctuation or casing.
    integration_to_category = {
        re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]
        for item in raw_map
    }
    return integration_to_category
|
|
|
def extract_tokens_from_filename(filename):
    """Split a workflow filename into lowercase tokens for category matching.

    Args:
        filename: Basename of a workflow file, e.g. "0042_Slack_Alert.json".

    Returns:
        list[str]: Non-empty, lowercased tokens from the filename.
    """
    # Strip only the trailing ".json" extension. (str.replace would also
    # delete any ".json" occurring in the middle of the name.)
    name_without_ext = filename
    if name_without_ext.endswith('.json'):
        name_without_ext = name_without_ext[:-len('.json')]

    # Split on underscores, dropping empty pieces from leading/trailing
    # or doubled underscores, and lowercase for matching.
    return [token.lower() for token in name_without_ext.split('_') if token]
|
|
|
def find_matching_category(tokens, integration_to_category):
    """Return the category for the first token matching a known integration.

    Matching is two-pass: an exact match on the normalized token first,
    then a looser substring match in either direction (e.g. "sheets"
    matching "googlesheets").

    Args:
        tokens: Iterable of filename tokens (any casing/punctuation).
        integration_to_category: Mapping of normalized integration name to
            category, as produced by load_def_categories().

    Returns:
        str: The matched category, or "" when nothing matches.
    """
    # Normalize once, up front, the same way the mapping keys were built.
    # Tokens that normalize to "" are dropped: an empty string is a
    # substring of every key and would spuriously match everything in the
    # partial-match pass below.
    normalized = []
    for token in tokens:
        norm = re.sub(r"[^a-z0-9]", "", token.lower())
        if norm:
            normalized.append(norm)

    # Pass 1: exact match on the normalized token.
    for norm in normalized:
        if norm in integration_to_category:
            return integration_to_category[norm]

    # Pass 2: partial matches for common variations.
    for norm in normalized:
        for integration_key in integration_to_category:
            if norm in integration_key or integration_key in norm:
                return integration_to_category[integration_key]

    return ""
|
|
|
def _collect_search_categories(workflows_dir, integration_to_category):
    """Recursively find workflow JSON files and assign each a category."""
    json_files = glob.glob(
        os.path.join(workflows_dir, "**", "*.json"),
        recursive=True
    )

    search_categories = []
    for json_file in json_files:
        filename = Path(json_file).name
        tokens = extract_tokens_from_filename(filename)
        category = find_matching_category(tokens, integration_to_category)
        search_categories.append({
            "filename": filename,
            "category": category
        })

    # Sort by filename so output is deterministic regardless of
    # filesystem traversal order.
    search_categories.sort(key=lambda x: x['filename'])
    return search_categories


def _write_json(path, data):
    """Write data to path as pretty-printed UTF-8 JSON (non-ASCII kept)."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _unique_categories(search_categories):
    """Return the sorted list of distinct categories for API consumption."""
    unique_categories = {item['category'] for item in search_categories if item['category']}
    # Always include 'Uncategorized' for workflows without categories.
    unique_categories.add('Uncategorized')
    return sorted(unique_categories)


def _print_distribution(search_categories):
    """Print the top-20 category distribution, counting blanks as 'Uncategorized'."""
    print("\n" + "="*50)
    print("CATEGORY DISTRIBUTION (Top 20)")
    print("="*50)

    # Count files per category.
    category_counts = {}
    for item in search_categories:
        category = item['category'] if item['category'] else "Uncategorized"
        category_counts[category] = category_counts.get(category, 0) + 1

    # Sort by count (descending) and display the top 20.
    sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
    for i, (category, count) in enumerate(sorted_categories[:20], 1):
        print(f"{i:2d}. {category:<40} {count:>4} files")

    if len(sorted_categories) > 20:
        remaining = len(sorted_categories) - 20
        print(f"\n... and {remaining} more categories")


def main():
    """Re-index workflow search categories.

    Reads context/def_categories.json, scans workflows/ recursively for
    JSON files, writes context/search_categories.json and
    context/unique_categories.json, and prints summary statistics plus
    contribution tips for uncategorized workflows.
    """
    # Load definition categories and categorize every workflow file.
    integration_to_category = load_def_categories()
    search_categories = _collect_search_categories(Path("workflows"), integration_to_category)

    # Write to search_categories.json.
    _write_json(Path("context/search_categories.json"), search_categories)
    print(f"Generated search_categories.json with {len(search_categories)} entries")

    # Write unique categories to a separate file for API consumption.
    categories_list = _unique_categories(search_categories)
    _write_json(Path("context/unique_categories.json"), categories_list)
    print(f"Generated unique_categories.json with {len(categories_list)} categories")

    # Print some statistics.
    categorized = sum(1 for item in search_categories if item['category'])
    uncategorized = len(search_categories) - categorized
    print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")

    _print_distribution(search_categories)

    # Tips on uncategorized workflows.
    print("\n" + "="*50)
    print("Tips on uncategorized workflows")
    print("="*50)
    print("1. At the search, you'll be able to list all uncategorized workflows.")
    print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")
    print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")

    # Done message.
    print("\n" + "="*50)
    print("Done! Search re-indexed with categories.")
    print("="*50)
|
if __name__ == "__main__":
|
|
main()
|