refactor: merge categorization scripts

Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted.
2025-11-25 19:37:52 +08:00 · 2025-08-12 14:29:36 +05:30
parent 6b641ec14f
commit 7d6e4e2222
4 changed files with 328 additions and 400 deletions
--- a/categorize_workflows.py
+++ b/categorize_workflows.py
@@ -1,216 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to categorize uncategorized n8n workflows based on filename patterns.
-This will help reduce the count of uncategorized workflows.
-"""
-
-import json
-from collections import defaultdict
-
-def load_categories():
-    """Load the search categories file."""
-    with open('context/search_categories.json', 'r', encoding='utf-8') as f:
-        return json.load(f)
-
-def load_unique_categories():
-    """Load the unique categories list."""
-    with open('context/unique_categories.json', 'r', encoding='utf-8') as f:
-        return json.load(f)
-
-def categorize_by_filename(filename):
-    """
-    Categorize workflow based on filename patterns.
-    Returns the most likely category or None if uncertain.
-    """
-    filename_lower = filename.lower()
-    
-    # Security & Authentication
-    if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
-        return "Technical Infrastructure & DevOps"
-    
-    # Data Processing & File Operations
-    if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile']):
-        return "Data Processing & Analysis"
-    
-    # Utility & Business Process Automation
-    if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate']):
-        return "Business Process Automation"
-    
-    # Webhook & API related
-    if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http']):
-        return "Web Scraping & Data Extraction"
-    
-    # Form & Data Collection
-    if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
-        return "Data Processing & Analysis"
-    
-    # Local file operations
-    if any(word in filename_lower for word in ['localfile', 'filemaker']):
-        return "Cloud Storage & File Management"
-    
-    # Database operations
-    if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
-        return "Data Processing & Analysis"
-    
-    # AI & Machine Learning
-    if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus']):
-        return "AI Agent Development"
-    
-    # E-commerce specific
-    if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
-        return "E-commerce & Retail"
-    
-    # Social media specific
-    if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
-        return "Social Media Management"
-    
-    # Customer support
-    if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
-        return "Communication & Messaging"
-    
-    # Analytics & Tracking
-    if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
-        return "Data Processing & Analysis"
-    
-    # Development tools
-    if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins']):
-        return "Technical Infrastructure & DevOps"
-    
-    # CRM & Sales tools
-    if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit']):
-        return "CRM & Sales"
-    
-    # Marketing tools
-    if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist']):
-        return "Marketing & Advertising Automation"
-    
-    # Project management
-    if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion']):
-        return "Project Management"
-    
-    # Communication
-    if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio']):
-        return "Communication & Messaging"
-    
-    # Cloud storage
-    if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3']):
-        return "Cloud Storage & File Management"
-    
-    # Creative tools
-    if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
-        return "Creative Design Automation"
-    
-    # Video & content
-    if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
-        return "Creative Content & Video Automation"
-    
-    # Financial tools
-    if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
-        return "Financial & Accounting"
-    
-    # Weather & external APIs
-    if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
-        return "Web Scraping & Data Extraction"
-    
-    return None
-
-def main():
-    """Main function to categorize workflows."""
-    print("Loading workflow categories...")
-    workflows = load_categories()
-    unique_categories = load_unique_categories()
-    
-    print(f"Total workflows: {len(workflows)}")
-    
-    # Count current categories
-    category_counts = defaultdict(int)
-    uncategorized_count = 0
-    
-    for workflow in workflows:
-        if workflow['category']:
-            category_counts[workflow['category']] += 1
-        else:
-            uncategorized_count += 1
-    
-    print(f"\nCurrent category distribution:")
-    for category, count in sorted(category_counts.items()):
-        print(f"  {category}: {count}")
-    print(f"  Uncategorized: {uncategorized_count}")
-    
-    # Identify uncategorized workflows
-    uncategorized_workflows = [w for w in workflows if not w['category']]
-    
-    print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")
-    
-    # Categorize based on filename patterns
-    suggested_categories = {}
-    uncertain_workflows = []
-    
-    for workflow in uncategorized_workflows:
-        filename = workflow['filename']
-        suggested_category = categorize_by_filename(filename)
-        
-        if suggested_category:
-            suggested_categories[filename] = suggested_category
-        else:
-            uncertain_workflows.append(filename)
-    
-    print(f"\nSuggested categorizations: {len(suggested_categories)}")
-    print(f"Still uncertain: {len(uncategorized_workflows)}")
-    
-    # Show suggested categorizations
-    if suggested_categories:
-        print("\nSuggested categorizations:")
-        for filename, category in sorted(suggested_categories.items()):
-            print(f"  {filename} → {category}")
-    
-    # Show uncertain workflows
-    if uncertain_workflows:
-        print(f"\nWorkflows that need manual review:")
-        for filename in sorted(uncertain_workflows):
-            print(f"  {filename}")
-    
-    # Calculate potential improvement
-    potential_categorized = len(suggested_categories)
-    new_uncategorized_count = uncategorized_count - potential_categorized
-    
-    print(f"\nPotential improvement:")
-    print(f"  Current uncategorized: {uncategorized_count}")
-    print(f"  After auto-categorization: {new_uncategorized_count}")
-    print(f"  Reduction: {potential_categorized} workflows ({potential_categorized/uncategorized_count*100:.1f}%)")
-    
-    # Ask if user wants to apply suggestions
-    if suggested_categories:
-        response = input(f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): ")
-        
-        if response.lower() in ['y', 'yes']:
-            # Apply the categorizations
-            for workflow in workflows:
-                if workflow['filename'] in suggested_categories:
-                    workflow['category'] = suggested_categories[workflow['filename']]
-            
-            # Save the updated file
-            with open('context/search_categories.json', 'w', encoding='utf-8') as f:
-                json.dump(workflows, f, indent=2, ensure_ascii=False)
-            
-            print("✅ Categorizations applied and saved!")
-            
-            # Show new distribution
-            new_category_counts = defaultdict(int)
-            new_uncategorized_count = 0
-            
-            for workflow in workflows:
-                if workflow['category']:
-                    new_category_counts[workflow['category']] += 1
-                else:
-                    new_uncategorized_count += 1
-            
-            print(f"\nNew category distribution:")
-            for category, count in sorted(new_category_counts.items()):
-                print(f"  {category}: {count}")
-            print(f"  Uncategorized: {new_uncategorized_count}")
-        else:
-            print("No changes applied.")
-
-if __name__ == "__main__":
-    main()
--- a/context/search_categories.json
+++ b/context/search_categories.json
--- a/create_categories.py
+++ b/create_categories.py
@@ -47,6 +47,103 @@ def find_matching_category(tokens, integration_to_category):
    
    return ""

+def categorize_by_filename(filename):
+    """
+    Return the category of the first keyword group with a substring match in the lowercased filename.
+    Groups are tried top to bottom; returns "" (not None) when no keyword matches.
+    """
+    filename_lower = filename.lower()
+    
+    # Security & Authentication
+    if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
+        return "Technical Infrastructure & DevOps"
+
+    # Data Processing & File Operations
+    if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile', 'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile']):
+        return "Data Processing & Analysis"
+
+    # Utility & Business Process Automation (note: 'n8n' matches any filename containing "n8n")
+    if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote', 'n8ntrainingcustomerdatastore', 'n8n']):
+        return "Business Process Automation"
+
+    # Webhook & API related
+    if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http', 'rssfeedread']):
+        return "Web Scraping & Data Extraction"
+
+    # Form & Data Collection (note: 'form' alone already matches 'typeform', 'jotform', and e.g. 'transform')
+    if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
+        return "Data Processing & Analysis"
+
+    # Local file operations
+    if any(word in filename_lower for word in ['localfile', 'filemaker']):
+        return "Cloud Storage & File Management"
+
+    # Database operations
+    if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
+        return "Data Processing & Analysis"
+
+    # AI & Machine Learning
+    if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus', 'googletranslate', 'summarize']):
+        return "AI Agent Development"
+
+    # E-commerce specific
+    if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
+        return "E-commerce & Retail"
+
+    # Social media specific
+    if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
+        return "Social Media Management"
+
+    # Customer support
+    if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
+        return "Communication & Messaging"
+
+    # Analytics & Tracking
+    if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
+        return "Data Processing & Analysis"
+
+    # Development tools (note: 'git' alone already matches 'github' and 'gitlab')
+    if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin', 'debughelper', 'bitbucket']):
+        return "Technical Infrastructure & DevOps"
+
+    # CRM & Sales tools
+    if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm']):
+        return "CRM & Sales"
+
+    # Marketing tools
+    if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark', 'mailgun']):
+        return "Marketing & Advertising Automation"
+
+    # Project management
+    if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo', 'calendly', 'jira']):
+        return "Project Management"
+
+    # Communication
+    if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams', 'gotowebinar']):
+        return "Communication & Messaging"
+
+    # Cloud storage
+    if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs']):
+        return "Cloud Storage & File Management"
+
+    # Creative tools
+    if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
+        return "Creative Design Automation"
+
+    # Video & content
+    if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
+        return "Creative Content & Video Automation"
+
+    # Financial tools
+    if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
+        return "Financial & Accounting"
+
+    # Weather & external APIs
+    if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
+        return "Web Scraping & Data Extraction"
+
+    return ""
+
 def main():
    # Load definition categories
    integration_to_category = load_def_categories()
@@ -72,6 +169,11 @@ def main():
            "category": category
        })

+    # Second pass for categorization
+    for item in search_categories:
+        if not item['category']:
+            item['category'] = categorize_by_filename(item['filename'])
+    
    # Sort by filename for consistency
    search_categories.sort(key=lambda x: x['filename'])
    
--- a/import_workflows.py
+++ b/import_workflows.py
@@ -10,6 +10,21 @@ import sys
 from pathlib import Path
 from typing import List, Dict, Any

+from create_categories import categorize_by_filename  # categorize_workflows.py is deleted by this commit; the function now lives in create_categories.py
+
+
+def load_categories():
+    """Load context/search_categories.json; return [] if the file is missing or is not valid JSON."""
+    try:
+        with open('context/search_categories.json', 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError):
+        return []
+
+def save_categories(data):
+    """Overwrite context/search_categories.json with data as indented UTF-8 JSON (non-ASCII kept as-is)."""
+    with open('context/search_categories.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)

 class WorkflowImporter:
    """Import n8n workflows with progress tracking and error handling."""
@@ -56,6 +71,32 @@ class WorkflowImporter:
            
            if result.returncode == 0:
                print(f"✅ Imported: {file_path.name}")
+                
+                # Categorize the workflow and update search_categories.json
+                suggested_category = categorize_by_filename(file_path.name)
+                
+                all_workflows_data = load_categories()
+                
+                found = False
+                for workflow_entry in all_workflows_data:
+                    if workflow_entry.get('filename') == file_path.name:
+                        workflow_entry['category'] = suggested_category
+                        found = True
+                        break
+                
+                if not found:
+                    # Add new workflow entry if not found (e.g., first import)
+                    all_workflows_data.append({
+                        "filename": file_path.name,
+                        "category": suggested_category,
+                        "name": file_path.stem, # Assuming workflow name is filename without extension
+                        "description": "", # Placeholder, can be updated manually
+                        "nodes": [] # Placeholder, can be updated manually
+                    })
+                
+                save_categories(all_workflows_data)
+                print(f"  Categorized '{file_path.name}' as '{suggested_category or 'Uncategorized'}'")
+                
                return True
            else:
                error_msg = result.stderr.strip() or result.stdout.strip()
@@ -141,6 +182,7 @@ def check_n8n_available() -> bool:

 def main():
    """Main entry point."""
+    sys.stdout.reconfigure(encoding='utf-8')
    print("🔧 N8N Workflow Importer")
    print("=" * 40)