# create_categories.py

import json
import os
from pathlib import Path
import glob
import re

def load_def_categories():
    """Build the integration -> category lookup from context/def_categories.json.

    The file is read as UTF-8 JSON and iterated as a sequence of objects that
    each provide an "integration" and a "category" entry. Integration names
    are lowercased and stripped of every character outside a-z / 0-9.

    Returns:
        dict: normalized integration name -> category. When several entries
        normalize to the same key, the last one in the file wins.
    """
    source = Path("context/def_categories.json")
    with source.open('r', encoding='utf-8') as handle:
        entries = json.load(handle)

    lookup = {}
    for entry in entries:
        # Same normalization that is applied to filename tokens when matching.
        key = re.sub(r"[^a-z0-9]", "", entry["integration"].lower())
        lookup[key] = entry["category"]
    return lookup

def extract_tokens_from_filename(filename):
    """Split a workflow filename into lowercase tokens.

    Only a trailing ".json" extension (in any letter case) is removed; the
    remainder is split on underscores and empty pieces are dropped.

    Args:
        filename: Base name of a workflow file, e.g. "0001_Slack_Webhook.json".

    Returns:
        list: Lowercased, non-empty tokens in their original order.
    """
    extension = '.json'
    stem = filename
    # Strip the extension only at the end of the name: str.replace would also
    # delete ".json" appearing in the middle of a name, and would miss ".JSON".
    if stem[-len(extension):].lower() == extension:
        stem = stem[:-len(extension)]

    # Consecutive, leading or trailing underscores yield empty pieces; skip them.
    return [token.lower() for token in stem.split('_') if token]

def find_matching_category(tokens, integration_to_category):
    """Return the category of the first integration matched by the tokens.

    Tokens are normalized the same way as the lookup keys (lowercased, every
    character outside a-z / 0-9 removed). Matching runs in two passes:

    1. Exact match of a normalized token against a key, in token order.
    2. Substring match in either direction (token inside key, or key inside
       token), in token order and then key order.

    Tokens and keys that normalize to the empty string are ignored. The empty
    string is a substring of every string, so it would otherwise "match" the
    first key in the mapping regardless of the filename.

    Args:
        tokens: Iterable of filename tokens (strings).
        integration_to_category: Mapping of normalized integration name to
            category.

    Returns:
        The matched category value, or "" when nothing matches.
    """
    # Normalize once and reuse the result for both passes.
    normalized = [re.sub(r"[^a-z0-9]", "", token.lower()) for token in tokens]
    normalized = [norm for norm in normalized if norm]

    # Pass 1: exact matches take priority over any partial match.
    for norm in normalized:
        if norm in integration_to_category:
            return integration_to_category[norm]

    # Pass 2: partial matches for common name variations.
    for norm in normalized:
        for integration_key, category in integration_to_category.items():
            if not integration_key:
                continue
            if norm in integration_key or integration_key in norm:
                return category

    return ""

def categorize_by_filename(filename):
    """
    Guess a workflow category from keywords contained in its filename.

    The filename is lowercased and checked against an ordered list of rules.
    A rule applies when any of its keywords occurs anywhere in the name
    (plain substring test). The first applicable rule decides the result, so
    the order of the rules below is significant.

    Returns the category name of the first matching rule, or "" if no rule
    matches.
    """
    rules = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile',
          'converttofile', 'googlefirebasecloudfirestore', 'supabase',
          'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches',
          'airtop', 'comparedatasets', 'spreadsheetfile', 'calcslive'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit',
          'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue',
          'stickynote', 'n8ntrainingcustomerdatastore', 'n8n'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http', 'rssfeedread'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai',
          'openthesaurus', 'googletranslate', 'summarize'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot',
          'gsuiteadmin', 'debughelper', 'bitbucket'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist',
          'sendy', 'postmark', 'mailgun'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl',
          'microsofttodo', 'calendly', 'jira'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio',
          'emailreadimap', 'teams', 'gotowebinar'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    lowered = filename.lower()
    for keywords, category in rules:
        for keyword in keywords:
            if keyword in lowered:
                return category

    return ""

def main():
    """Index every workflow JSON file with a category and write the results.

    Steps:
      1. Load the integration -> category definitions.
      2. Collect all *.json files below the "workflows" directory (recursive).
      3. Categorize each file by its name: definition lookup first, filename
         keyword rules as fallback.
      4. Write context/search_categories.json (sorted by filename) and
         context/unique_categories.json (sorted names, always including
         'Uncategorized').
      5. Print summary statistics and contribution tips.
    """
    divider = "=" * 50

    integration_to_category = load_def_categories()

    # Collect all JSON files from the workflows directory, at any depth.
    workflows_dir = Path("workflows")
    pattern = os.path.join(workflows_dir, "**", "*.json")

    search_categories = []
    for json_file in glob.glob(pattern, recursive=True):
        filename = Path(json_file).name
        tokens = extract_tokens_from_filename(filename)
        # Fall back to the filename keyword rules when the definitions
        # yield no (truthy) category.
        category = (
            find_matching_category(tokens, integration_to_category)
            or categorize_by_filename(filename)
        )
        search_categories.append({"filename": filename, "category": category})

    # Sort by filename for consistency
    search_categories.sort(key=lambda entry: entry['filename'])

    # Write to search_categories.json
    output_path = Path("context/search_categories.json")
    with output_path.open('w', encoding='utf-8') as out:
        json.dump(search_categories, out, indent=2, ensure_ascii=False)

    print(f"Generated search_categories.json with {len(search_categories)} entries")

    # Unique category names for the API; 'Uncategorized' is always included
    # for workflows without a category.
    unique_categories = {
        entry['category'] for entry in search_categories if entry['category']
    }
    unique_categories.add('Uncategorized')
    categories_list = sorted(unique_categories)

    categories_output_path = Path("context/unique_categories.json")
    with categories_output_path.open('w', encoding='utf-8') as out:
        json.dump(categories_list, out, indent=2, ensure_ascii=False)

    print(f"Generated unique_categories.json with {len(categories_list)} categories")

    # Overall statistics
    categorized = len([entry for entry in search_categories if entry['category']])
    uncategorized = len(search_categories) - categorized
    print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")

    # Detailed category statistics
    print("\n" + divider)
    print("CATEGORY DISTRIBUTION (Top 20)")
    print(divider)

    # Tally files per category; empty categories are counted as "Uncategorized".
    category_counts = {}
    for entry in search_categories:
        label = entry['category'] or "Uncategorized"
        category_counts[label] = category_counts.get(label, 0) + 1

    # Highest count first
    ranking = sorted(category_counts.items(), key=lambda pair: pair[1], reverse=True)

    for position, (label, count) in enumerate(ranking[:20], 1):
        print(f"{position:2d}. {label:<40} {count:>4} files")

    if len(ranking) > 20:
        remaining = len(ranking) - 20
        print(f"\n... and {remaining} more categories")

    # Tips on uncategorized workflows
    print("\n" + divider)
    print("Tips on uncategorized workflows")
    print(divider)
    print("1. At the search, you'll be able to list all uncategorized workflows.")
    print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")
    print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")

    # Done message
    print("\n" + divider)
    print("Done! Search re-indexed with categories.")
    print(divider)

# Run the categorization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00			`import json`
			`import os`
			`from pathlib import Path`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`import glob`
			`import re`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00
			`def load_def_categories():`
			`"""Load the definition categories from def_categories.json"""`
			`def_categories_path = Path("context/def_categories.json")`
			`with open(def_categories_path, 'r', encoding='utf-8') as f:`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`raw_map = json.load(f)`

			`# Normalize keys: strip non-alphanumerics and lowercase`
			`integration_to_category = {`
			`re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]`
			`for item in raw_map`
			`}`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00			`return integration_to_category`

			`def extract_tokens_from_filename(filename):`
			`"""Extract tokens from filename by splitting on '_' and removing '.json'"""`
			`# Remove .json extension`
			`name_without_ext = filename.replace('.json', '')`

			`# Split by underscore`
			`tokens = name_without_ext.split('_')`

			`# Convert to lowercase for matching`
			`tokens = [token.lower() for token in tokens if token]`

			`return tokens`

			`def find_matching_category(tokens, integration_to_category):`
			`"""Find the first matching category for the given tokens"""`
			`for token in tokens:`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`# Normalize token same as keys`
			`norm = re.sub(r"[^a-z0-9]", "", token.lower())`
			`if norm in integration_to_category:`
			`return integration_to_category[norm]`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00
			`# Try partial matches for common variations`
			`for token in tokens:`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`norm = re.sub(r"[^a-z0-9]", "", token.lower())`
			`for integration_key in integration_to_category:`
			`if norm in integration_key or integration_key in norm:`
			`return integration_to_category[integration_key]`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00
			`return ""`

refactor: merge categorization scripts Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted. 2025-08-12 14:29:36 +05:30			`def categorize_by_filename(filename):`
			`"""`
			`Categorize workflow based on filename patterns.`
			`Returns the most likely category or None if uncertain.`
			`"""`
			`filename_lower = filename.lower()`

			`# Security & Authentication`
			`if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):`
			`return "Technical Infrastructure & DevOps"`

			`# Data Processing & File Operations`
feat: Add GitHub Pages public search interface and enhanced documentation system ## 🌐 GitHub Pages Public Search Interface - Complete client-side search application solving Issue #84 - Responsive HTML/CSS/JavaScript with mobile optimization - Real-time search across 2,057+ workflows with instant results - Category filtering across 15 workflow categories - Dark/light theme support with system preference detection - Direct workflow JSON download functionality ## 🤖 GitHub Actions Automation - deploy-pages.yml: Automated deployment to GitHub Pages - update-readme.yml: Weekly automated README statistics updates - Comprehensive workflow indexing and category generation ## 🔍 Enhanced Search & Categorization - Static search index generation for GitHub Pages - Developer-chosen category prioritization system - CalcsLive custom node integration and categorization - Enhanced workflow database with better custom node detection - Fixed README corruption with live database statistics ## 📚 Documentation & Infrastructure - Comprehensive CHANGELOG.md with proper versioning - Enhanced README with accurate statistics and public interface links - Professional documentation solving repository infrastructure needs ## Technical Improvements - Fixed Unicode encoding issues in Python scripts - Enhanced CalcsLive detection with false positive prevention - Improved JSON description preservation and indexing - Mobile-optimized responsive design for all devices 🤖 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com> 2025-09-29 21:54:12 -07:00			`if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile', 'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile', 'calcslive']):`
refactor: merge categorization scripts Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted. 2025-08-12 14:29:36 +05:30			`return "Data Processing & Analysis"`

			`# Utility & Business Process Automation`
			`if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote', 'n8ntrainingcustomerdatastore', 'n8n']):`
			`return "Business Process Automation"`

			`# Webhook & API related`
			`if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http', 'rssfeedread']):`
			`return "Web Scraping & Data Extraction"`

			`# Form & Data Collection`
			`if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):`
			`return "Data Processing & Analysis"`

			`# Local file operations`
			`if any(word in filename_lower for word in ['localfile', 'filemaker']):`
			`return "Cloud Storage & File Management"`

			`# Database operations`
			`if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):`
			`return "Data Processing & Analysis"`

			`# AI & Machine Learning`
			`if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus', 'googletranslate', 'summarize']):`
			`return "AI Agent Development"`

			`# E-commerce specific`
			`if any(word in filename_lower for word in ['woocommerce', 'gumroad']):`
			`return "E-commerce & Retail"`

			`# Social media specific`
			`if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):`
			`return "Social Media Management"`

			`# Customer support`
			`if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):`
			`return "Communication & Messaging"`

			`# Analytics & Tracking`
			`if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):`
			`return "Data Processing & Analysis"`

			`# Development tools`
			`if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin', 'debughelper', 'bitbucket']):`
			`return "Technical Infrastructure & DevOps"`

			`# CRM & Sales tools`
			`if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm']):`
			`return "CRM & Sales"`

			`# Marketing tools`
			`if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark', 'mailgun']):`
			`return "Marketing & Advertising Automation"`

			`# Project management`
			`if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo', 'calendly', 'jira']):`
			`return "Project Management"`

			`# Communication`
			`if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams', 'gotowebinar']):`
			`return "Communication & Messaging"`

			`# Cloud storage`
			`if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs']):`
			`return "Cloud Storage & File Management"`

			`# Creative tools`
			`if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):`
			`return "Creative Design Automation"`

			`# Video & content`
			`if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):`
			`return "Creative Content & Video Automation"`

			`# Financial tools`
			`if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):`
			`return "Financial & Accounting"`

			`# Weather & external APIs`
			`if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):`
			`return "Web Scraping & Data Extraction"`

			`return ""`

feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00			`def main():`
			`# Load definition categories`
			`integration_to_category = load_def_categories()`

			`# Get all JSON files from workflows directory`
			`workflows_dir = Path("workflows")`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`json_files = glob.glob(`
			`os.path.join(workflows_dir, "*", ".json"),`
			`recursive=True`
			`)`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00
			`# Process each file`
			`search_categories = []`

			`for json_file in json_files:`
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82) 2025-08-12 10:34:22 +05:30			`path_obj = Path(json_file)`
			`filename = path_obj.name`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00			`tokens = extract_tokens_from_filename(filename)`
			`category = find_matching_category(tokens, integration_to_category)`

			`search_categories.append({`
			`"filename": filename,`
			`"category": category`
			`})`
refactor: merge categorization scripts Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted. 2025-08-12 14:29:36 +05:30
			`# Second pass for categorization`
			`for item in search_categories:`
			`if not item['category']:`
			`item['category'] = categorize_by_filename(item['filename'])`
feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00
			`# Sort by filename for consistency`
			`search_categories.sort(key=lambda x: x['filename'])`

			`# Write to search_categories.json`
			`output_path = Path("context/search_categories.json")`
			`with open(output_path, 'w', encoding='utf-8') as f:`
			`json.dump(search_categories, f, indent=2, ensure_ascii=False)`

			`print(f"Generated search_categories.json with {len(search_categories)} entries")`

feat: Add category filter to search interface 2025-06-29 03:26:18 -07:00			`# Generate unique categories list for API`
			`unique_categories = set()`
			`for item in search_categories:`
			`if item['category']:`
			`unique_categories.add(item['category'])`

			`# Always include 'Uncategorized' for workflows without categories`
			`unique_categories.add('Uncategorized')`

			`# Sort categories alphabetically`
			`categories_list = sorted(list(unique_categories))`

			`# Write unique categories to a separate file for API consumption`
			`categories_output_path = Path("context/unique_categories.json")`
			`with open(categories_output_path, 'w', encoding='utf-8') as f:`
			`json.dump(categories_list, f, indent=2, ensure_ascii=False)`

			`print(f"Generated unique_categories.json with {len(categories_list)} categories")`

feat: Add categorization script that indexes the workflows with category for category search 2025-06-29 03:08:41 -07:00			`# Print some statistics`
			`categorized = sum(1 for item in search_categories if item['category'])`
			`uncategorized = len(search_categories) - categorized`
			`print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")`

			`# Print detailed category statistics`
			`print("\n" + "="*50)`
			`print("CATEGORY DISTRIBUTION (Top 20)")`
			`print("="*50)`

			`# Count categories`
			`category_counts = {}`
			`for item in search_categories:`
			`category = item['category'] if item['category'] else "Uncategorized"`
			`category_counts[category] = category_counts.get(category, 0) + 1`

			`# Sort by count (descending)`
			`sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)`

			`# Display top 20`
			`for i, (category, count) in enumerate(sorted_categories[:20], 1):`
			`print(f"{i:2d}. {category:<40} {count:>4} files")`

			`if len(sorted_categories) > 20:`
			`remaining = len(sorted_categories) - 20`
			`print(f"\n... and {remaining} more categories")`

			`# Write tips on uncategorized workflows`
			`print("\n" + "="*50)`
			`print("Tips on uncategorized workflows")`
			`print("="*50)`
			`print("1. At the search, you'll be able to list all uncategorized workflows.")`
			`print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")`
			`print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")`


			`# Done message`
			`print("\n" + "="*50)`
			`print("Done! Search re-indexed with categories.")`
			`print("="*50)`

			`if __name__ == "__main__":`
refactor: merge categorization scripts Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted. 2025-08-12 14:29:36 +05:30			`main()`