Reduced the uncategorised workflows by 85%

2025-11-25 03:15:25 +08:00 · 2025-08-12 12:04:03 +05:30
parent 6dc031c576
commit 6b641ec14f
2 changed files with 958 additions and 742 deletions
--- a/categorize_workflows.py
+++ b/categorize_workflows.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Script to categorize uncategorized n8n workflows based on filename patterns.
+This will help reduce the count of uncategorized workflows.
+"""
+
+import json
+from collections import defaultdict
+
+def load_categories():
+    """Load the search categories file."""
+    with open('context/search_categories.json', 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+def load_unique_categories():
+    """Load the unique categories list."""
+    with open('context/unique_categories.json', 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+def categorize_by_filename(filename):
+    """
+    Categorize workflow based on filename patterns.
+    Returns the most likely category or None if uncertain.
+    """
+    filename_lower = filename.lower()
+    
+    # Security & Authentication
+    if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
+        return "Technical Infrastructure & DevOps"
+    
+    # Data Processing & File Operations
+    if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile']):
+        return "Data Processing & Analysis"
+    
+    # Utility & Business Process Automation
+    if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate']):
+        return "Business Process Automation"
+    
+    # Webhook & API related
+    if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http']):
+        return "Web Scraping & Data Extraction"
+    
+    # Form & Data Collection
+    if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
+        return "Data Processing & Analysis"
+    
+    # Local file operations
+    if any(word in filename_lower for word in ['localfile', 'filemaker']):
+        return "Cloud Storage & File Management"
+    
+    # Database operations
+    if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
+        return "Data Processing & Analysis"
+    
+    # AI & Machine Learning
+    if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus']):
+        return "AI Agent Development"
+    
+    # E-commerce specific
+    if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
+        return "E-commerce & Retail"
+    
+    # Social media specific
+    if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
+        return "Social Media Management"
+    
+    # Customer support
+    if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
+        return "Communication & Messaging"
+    
+    # Analytics & Tracking
+    if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
+        return "Data Processing & Analysis"
+    
+    # Development tools
+    if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins']):
+        return "Technical Infrastructure & DevOps"
+    
+    # CRM & Sales tools
+    if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit']):
+        return "CRM & Sales"
+    
+    # Marketing tools
+    if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist']):
+        return "Marketing & Advertising Automation"
+    
+    # Project management
+    if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion']):
+        return "Project Management"
+    
+    # Communication
+    if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio']):
+        return "Communication & Messaging"
+    
+    # Cloud storage
+    if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3']):
+        return "Cloud Storage & File Management"
+    
+    # Creative tools
+    if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
+        return "Creative Design Automation"
+    
+    # Video & content
+    if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
+        return "Creative Content & Video Automation"
+    
+    # Financial tools
+    if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
+        return "Financial & Accounting"
+    
+    # Weather & external APIs
+    if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
+        return "Web Scraping & Data Extraction"
+    
+    return None
+
+def main():
+    """Main function to categorize workflows."""
+    print("Loading workflow categories...")
+    workflows = load_categories()
+    unique_categories = load_unique_categories()
+    
+    print(f"Total workflows: {len(workflows)}")
+    
+    # Count current categories
+    category_counts = defaultdict(int)
+    uncategorized_count = 0
+    
+    for workflow in workflows:
+        if workflow['category']:
+            category_counts[workflow['category']] += 1
+        else:
+            uncategorized_count += 1
+    
+    print(f"\nCurrent category distribution:")
+    for category, count in sorted(category_counts.items()):
+        print(f"  {category}: {count}")
+    print(f"  Uncategorized: {uncategorized_count}")
+    
+    # Identify uncategorized workflows
+    uncategorized_workflows = [w for w in workflows if not w['category']]
+    
+    print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")
+    
+    # Categorize based on filename patterns
+    suggested_categories = {}
+    uncertain_workflows = []
+    
+    for workflow in uncategorized_workflows:
+        filename = workflow['filename']
+        suggested_category = categorize_by_filename(filename)
+        
+        if suggested_category:
+            suggested_categories[filename] = suggested_category
+        else:
+            uncertain_workflows.append(filename)
+    
+    print(f"\nSuggested categorizations: {len(suggested_categories)}")
+    print(f"Still uncertain: {len(uncategorized_workflows)}")
+    
+    # Show suggested categorizations
+    if suggested_categories:
+        print("\nSuggested categorizations:")
+        for filename, category in sorted(suggested_categories.items()):
+            print(f"  {filename} → {category}")
+    
+    # Show uncertain workflows
+    if uncertain_workflows:
+        print(f"\nWorkflows that need manual review:")
+        for filename in sorted(uncertain_workflows):
+            print(f"  {filename}")
+    
+    # Calculate potential improvement
+    potential_categorized = len(suggested_categories)
+    new_uncategorized_count = uncategorized_count - potential_categorized
+    
+    print(f"\nPotential improvement:")
+    print(f"  Current uncategorized: {uncategorized_count}")
+    print(f"  After auto-categorization: {new_uncategorized_count}")
+    print(f"  Reduction: {potential_categorized} workflows ({potential_categorized/uncategorized_count*100:.1f}%)")
+    
+    # Ask if user wants to apply suggestions
+    if suggested_categories:
+        response = input(f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): ")
+        
+        if response.lower() in ['y', 'yes']:
+            # Apply the categorizations
+            for workflow in workflows:
+                if workflow['filename'] in suggested_categories:
+                    workflow['category'] = suggested_categories[workflow['filename']]
+            
+            # Save the updated file
+            with open('context/search_categories.json', 'w', encoding='utf-8') as f:
+                json.dump(workflows, f, indent=2, ensure_ascii=False)
+            
+            print("✅ Categorizations applied and saved!")
+            
+            # Show new distribution
+            new_category_counts = defaultdict(int)
+            new_uncategorized_count = 0
+            
+            for workflow in workflows:
+                if workflow['category']:
+                    new_category_counts[workflow['category']] += 1
+                else:
+                    new_uncategorized_count += 1
+            
+            print(f"\nNew category distribution:")
+            for category, count in sorted(new_category_counts.items()):
+                print(f"  {category}: {count}")
+            print(f"  Uncategorized: {new_uncategorized_count}")
+        else:
+            print("No changes applied.")
+
+if __name__ == "__main__":
+    main()
--- a/context/search_categories.json
+++ b/context/search_categories.json