refactor: merge categorization scripts

Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted.
This commit is contained in:
Praveen Mudalgeri
2025-08-12 14:29:36 +05:30
parent 6b641ec14f
commit 7d6e4e2222
4 changed files with 328 additions and 400 deletions

View File

@@ -1,216 +0,0 @@
#!/usr/bin/env python3
"""
Script to categorize uncategorized n8n workflows based on filename patterns.
This will help reduce the count of uncategorized workflows.
"""
import json
from collections import defaultdict
def load_categories():
    """Read context/search_categories.json and return its decoded JSON content."""
    with open('context/search_categories.json', 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def load_unique_categories():
    """Read context/unique_categories.json and return its decoded JSON content."""
    with open('context/unique_categories.json', 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def categorize_by_filename(filename):
    """
    Guess a workflow's category from keywords contained in its filename.

    The lowercased filename is tested against keyword groups in a fixed
    order; the first group with any keyword occurring as a substring decides
    the result. Returns that group's category name, or None if no group
    matches.
    """
    lowered = filename.lower()

    # Order matters: earlier groups take precedence over later ones.
    keyword_groups = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    for keywords, category in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return category
    return None
def _count_by_category(workflows):
    """Return (per-category counts, number of workflows with a falsy category)."""
    counts = defaultdict(int)
    uncategorized = 0
    for workflow in workflows:
        if workflow['category']:
            counts[workflow['category']] += 1
        else:
            uncategorized += 1
    return counts, uncategorized


def _print_distribution(title, counts, uncategorized):
    """Print a titled, alphabetically sorted category distribution."""
    print(title)
    for category, count in sorted(counts.items()):
        print(f" {category}: {count}")
    print(f" Uncategorized: {uncategorized}")


def main():
    """Suggest categories for uncategorized workflows and optionally save them.

    Loads the workflow list via load_categories(), prints the current
    category distribution, derives a suggestion for every workflow whose
    'category' is falsy using categorize_by_filename(), reports the
    suggestions and the workflows that still need manual review, and, after
    interactive confirmation, writes the updated list back to
    context/search_categories.json.
    """
    print("Loading workflow categories...")
    workflows = load_categories()
    print(f"Total workflows: {len(workflows)}")

    # Count current categories
    category_counts, uncategorized_count = _count_by_category(workflows)
    _print_distribution("\nCurrent category distribution:", category_counts, uncategorized_count)

    # Identify uncategorized workflows
    uncategorized_workflows = [w for w in workflows if not w['category']]
    print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")

    # Categorize based on filename patterns
    suggested_categories = {}
    uncertain_workflows = []
    for workflow in uncategorized_workflows:
        filename = workflow['filename']
        suggested_category = categorize_by_filename(filename)
        if suggested_category:
            suggested_categories[filename] = suggested_category
        else:
            uncertain_workflows.append(filename)

    print(f"\nSuggested categorizations: {len(suggested_categories)}")
    # Report the workflows that got no suggestion, not every uncategorized one.
    print(f"Still uncertain: {len(uncertain_workflows)}")

    # Show suggested categorizations
    if suggested_categories:
        print("\nSuggested categorizations:")
        for filename, category in sorted(suggested_categories.items()):
            print(f" {filename} -> {category}")

    # Show uncertain workflows
    if uncertain_workflows:
        print("\nWorkflows that need manual review:")
        for filename in sorted(uncertain_workflows):
            print(f" {filename}")

    # Calculate potential improvement
    potential_categorized = len(suggested_categories)
    new_uncategorized_count = uncategorized_count - potential_categorized
    # Guard the percentage: with nothing uncategorized the ratio is undefined.
    if uncategorized_count:
        reduction_pct = potential_categorized / uncategorized_count * 100
    else:
        reduction_pct = 0.0
    print("\nPotential improvement:")
    print(f" Current uncategorized: {uncategorized_count}")
    print(f" After auto-categorization: {new_uncategorized_count}")
    print(f" Reduction: {potential_categorized} workflows ({reduction_pct:.1f}%)")

    # Ask if user wants to apply suggestions
    if not suggested_categories:
        return

    try:
        response = input(
            f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): "
        )
    except EOFError:
        # Non-interactive stdin: treat as a refusal instead of crashing.
        response = "n"

    if response.strip().lower() not in ('y', 'yes'):
        print("No changes applied.")
        return

    # Apply the categorizations
    for workflow in workflows:
        if workflow['filename'] in suggested_categories:
            workflow['category'] = suggested_categories[workflow['filename']]

    # Save the updated file
    with open('context/search_categories.json', 'w', encoding='utf-8') as f:
        json.dump(workflows, f, indent=2, ensure_ascii=False)
    print("✅ Categorizations applied and saved!")

    # Show new distribution
    new_category_counts, new_uncategorized_count = _count_by_category(workflows)
    _print_distribution("\nNew category distribution:", new_category_counts, new_uncategorized_count)
# Run the interactive categorization only when executed as a script.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -47,6 +47,103 @@ def find_matching_category(tokens, integration_to_category):
return ""
# Ordered (keywords, category) rules used by categorize_by_filename().
# Order matters: the first rule with a keyword contained in the filename wins,
# so generic node names ('code', 'schedule', 'webhook', ...) take precedence
# over the service-specific rules further down.
# Matching is by substring, so some keywords are already covered by shorter
# ones in the same rule (e.g. 'github'/'gitlab' by 'git', 'typeform'/'jotform'
# by 'form', 'n8ntrainingcustomerdatastore' by 'n8n'); they are kept to
# document intent.
_FILENAME_CATEGORY_RULES = (
    # Security & Authentication
    (('totp', 'bitwarden', 'auth', 'security'),
     "Technical Infrastructure & DevOps"),
    # Data Processing & File Operations
    (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile',
      'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf',
      'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile'),
     "Data Processing & Analysis"),
    # Utility & Business Process Automation
    (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate',
      'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote',
      'n8ntrainingcustomerdatastore', 'n8n'),
     "Business Process Automation"),
    # Webhook & API related
    (('webhook', 'respondtowebhook', 'http', 'rssfeedread'),
     "Web Scraping & Data Extraction"),
    # Form & Data Collection
    (('form', 'typeform', 'jotform'),
     "Data Processing & Analysis"),
    # Local file operations
    (('localfile', 'filemaker'),
     "Cloud Storage & File Management"),
    # Database operations
    (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
     "Data Processing & Analysis"),
    # AI & Machine Learning
    (('openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus',
      'googletranslate', 'summarize'),
     "AI Agent Development"),
    # E-commerce specific
    (('woocommerce', 'gumroad'),
     "E-commerce & Retail"),
    # Social media specific
    (('facebook', 'linkedin', 'instagram'),
     "Social Media Management"),
    # Customer support
    (('zendesk', 'intercom', 'drift', 'pagerduty'),
     "Communication & Messaging"),
    # Analytics & Tracking
    (('googleanalytics', 'segment', 'mixpanel'),
     "Data Processing & Analysis"),
    # Development tools
    (('git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin',
      'debughelper', 'bitbucket'),
     "Technical Infrastructure & DevOps"),
    # CRM & Sales tools
    (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm'),
     "CRM & Sales"),
    # Marketing tools
    (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark',
      'mailgun'),
     "Marketing & Advertising Automation"),
    # Project management
    (('asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo',
      'calendly', 'jira'),
     "Project Management"),
    # Communication
    (('slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams',
      'gotowebinar'),
     "Communication & Messaging"),
    # Cloud storage
    (('dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs'),
     "Cloud Storage & File Management"),
    # Creative tools
    (('canva', 'figma', 'bannerbear', 'editimage'),
     "Creative Design Automation"),
    # Video & content
    (('youtube', 'vimeo', 'storyblok', 'strapi'),
     "Creative Content & Video Automation"),
    # Financial tools
    (('stripe', 'chargebee', 'quickbooks', 'harvest'),
     "Financial & Accounting"),
    # Weather & external APIs
    (('openweathermap', 'nasa', 'crypto', 'coingecko'),
     "Web Scraping & Data Extraction"),
)


def categorize_by_filename(filename):
    """
    Categorize workflow based on filename patterns.

    The filename is lowercased and tested against the keyword rules in
    _FILENAME_CATEGORY_RULES in order; a keyword matches when it occurs
    anywhere in the filename (substring match).

    Args:
        filename: Workflow filename to inspect.

    Returns:
        The category name of the first matching rule, or an empty string
        when no rule matches or the filename is empty/None.
    """
    if not filename:
        return ""

    filename_lower = filename.lower()
    for keywords, category in _FILENAME_CATEGORY_RULES:
        if any(keyword in filename_lower for keyword in keywords):
            return category
    return ""
def main():
# Load definition categories
integration_to_category = load_def_categories()
@@ -72,6 +169,11 @@ def main():
"category": category
})
# Second pass for categorization
for item in search_categories:
if not item['category']:
item['category'] = categorize_by_filename(item['filename'])
# Sort by filename for consistency
search_categories.sort(key=lambda x: x['filename'])

View File

@@ -10,6 +10,21 @@ import sys
from pathlib import Path
from typing import List, Dict, Any

# The categorization logic was merged into create_categories.py and the
# standalone categorize_workflows.py script was removed, so import from the
# merged module first. The legacy module is kept as a fallback for checkouts
# that still ship it.
# NOTE(review): assumes create_categories.py only runs its main() behind an
# `if __name__ == "__main__"` guard - confirm before relying on this import.
try:
    from create_categories import categorize_by_filename
except ImportError:
    from categorize_workflows import categorize_by_filename
def load_categories():
    """Load the search categories file.

    Returns:
        The list stored in context/search_categories.json, or an empty list
        when the file is missing, is not valid JSON, or does not contain a
        JSON array.
    """
    try:
        with open('context/search_categories.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet; start from an empty list.
        return []
    except json.JSONDecodeError as exc:
        # Still best-effort, but say so: the next save will replace the
        # unreadable file, and that should not happen silently.
        print(
            f"Warning: context/search_categories.json is not valid JSON ({exc}); "
            "starting from an empty list.",
            file=sys.stderr,
        )
        return []

    if not isinstance(data, list):
        # Callers iterate over the result and append to it.
        print(
            "Warning: context/search_categories.json does not contain a JSON array; "
            "starting from an empty list.",
            file=sys.stderr,
        )
        return []
    return data
def save_categories(data):
    """Save the search categories file.

    Args:
        data: JSON-serializable content to write to
            context/search_categories.json (pretty-printed, UTF-8,
            non-ASCII characters kept as-is).
    """
    target = Path('context/search_categories.json')
    # load_categories() tolerates a missing file, so the first save may run
    # before the directory exists; create it instead of failing.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
class WorkflowImporter:
"""Import n8n workflows with progress tracking and error handling."""
@@ -56,6 +71,32 @@ class WorkflowImporter:
if result.returncode == 0:
print(f"✅ Imported: {file_path.name}")
# Categorize the workflow and update search_categories.json
suggested_category = categorize_by_filename(file_path.name)
all_workflows_data = load_categories()
found = False
for workflow_entry in all_workflows_data:
if workflow_entry.get('filename') == file_path.name:
workflow_entry['category'] = suggested_category
found = True
break
if not found:
# Add new workflow entry if not found (e.g., first import)
all_workflows_data.append({
"filename": file_path.name,
"category": suggested_category,
"name": file_path.stem, # Assuming workflow name is filename without extension
"description": "", # Placeholder, can be updated manually
"nodes": [] # Placeholder, can be updated manually
})
save_categories(all_workflows_data)
print(f" Categorized '{file_path.name}' as '{suggested_category or 'Uncategorized'}'")
return True
else:
error_msg = result.stderr.strip() or result.stdout.strip()
@@ -141,6 +182,7 @@ def check_n8n_available() -> bool:
def main():
"""Main entry point."""
sys.stdout.reconfigure(encoding='utf-8')
print("🔧 N8N Workflow Importer")
print("=" * 40)