Reduced the uncategorised workflows by 85%

This commit is contained in:
Praveen Mudalgeri
2025-08-12 12:04:03 +05:30
parent 6dc031c576
commit 6b641ec14f
2 changed files with 958 additions and 742 deletions

216
categorize_workflows.py Normal file
View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Script to categorize uncategorized n8n workflows based on filename patterns.
This will help reduce the count of uncategorized workflows.
"""
import json
from collections import defaultdict
def load_categories():
"""Load the search categories file."""
with open('context/search_categories.json', 'r', encoding='utf-8') as f:
return json.load(f)
def load_unique_categories():
"""Load the unique categories list."""
with open('context/unique_categories.json', 'r', encoding='utf-8') as f:
return json.load(f)
def categorize_by_filename(filename):
"""
Categorize workflow based on filename patterns.
Returns the most likely category or None if uncertain.
"""
filename_lower = filename.lower()
# Security & Authentication
if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
return "Technical Infrastructure & DevOps"
# Data Processing & File Operations
if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile']):
return "Data Processing & Analysis"
# Utility & Business Process Automation
if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate']):
return "Business Process Automation"
# Webhook & API related
if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http']):
return "Web Scraping & Data Extraction"
# Form & Data Collection
if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
return "Data Processing & Analysis"
# Local file operations
if any(word in filename_lower for word in ['localfile', 'filemaker']):
return "Cloud Storage & File Management"
# Database operations
if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
return "Data Processing & Analysis"
# AI & Machine Learning
if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus']):
return "AI Agent Development"
# E-commerce specific
if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
return "E-commerce & Retail"
# Social media specific
if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
return "Social Media Management"
# Customer support
if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
return "Communication & Messaging"
# Analytics & Tracking
if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
return "Data Processing & Analysis"
# Development tools
if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins']):
return "Technical Infrastructure & DevOps"
# CRM & Sales tools
if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit']):
return "CRM & Sales"
# Marketing tools
if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist']):
return "Marketing & Advertising Automation"
# Project management
if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion']):
return "Project Management"
# Communication
if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio']):
return "Communication & Messaging"
# Cloud storage
if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3']):
return "Cloud Storage & File Management"
# Creative tools
if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
return "Creative Design Automation"
# Video & content
if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
return "Creative Content & Video Automation"
# Financial tools
if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
return "Financial & Accounting"
# Weather & external APIs
if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
return "Web Scraping & Data Extraction"
return None
def main():
"""Main function to categorize workflows."""
print("Loading workflow categories...")
workflows = load_categories()
unique_categories = load_unique_categories()
print(f"Total workflows: {len(workflows)}")
# Count current categories
category_counts = defaultdict(int)
uncategorized_count = 0
for workflow in workflows:
if workflow['category']:
category_counts[workflow['category']] += 1
else:
uncategorized_count += 1
print(f"\nCurrent category distribution:")
for category, count in sorted(category_counts.items()):
print(f" {category}: {count}")
print(f" Uncategorized: {uncategorized_count}")
# Identify uncategorized workflows
uncategorized_workflows = [w for w in workflows if not w['category']]
print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")
# Categorize based on filename patterns
suggested_categories = {}
uncertain_workflows = []
for workflow in uncategorized_workflows:
filename = workflow['filename']
suggested_category = categorize_by_filename(filename)
if suggested_category:
suggested_categories[filename] = suggested_category
else:
uncertain_workflows.append(filename)
print(f"\nSuggested categorizations: {len(suggested_categories)}")
print(f"Still uncertain: {len(uncategorized_workflows)}")
# Show suggested categorizations
if suggested_categories:
print("\nSuggested categorizations:")
for filename, category in sorted(suggested_categories.items()):
print(f" {filename}{category}")
# Show uncertain workflows
if uncertain_workflows:
print(f"\nWorkflows that need manual review:")
for filename in sorted(uncertain_workflows):
print(f" {filename}")
# Calculate potential improvement
potential_categorized = len(suggested_categories)
new_uncategorized_count = uncategorized_count - potential_categorized
print(f"\nPotential improvement:")
print(f" Current uncategorized: {uncategorized_count}")
print(f" After auto-categorization: {new_uncategorized_count}")
print(f" Reduction: {potential_categorized} workflows ({potential_categorized/uncategorized_count*100:.1f}%)")
# Ask if user wants to apply suggestions
if suggested_categories:
response = input(f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): ")
if response.lower() in ['y', 'yes']:
# Apply the categorizations
for workflow in workflows:
if workflow['filename'] in suggested_categories:
workflow['category'] = suggested_categories[workflow['filename']]
# Save the updated file
with open('context/search_categories.json', 'w', encoding='utf-8') as f:
json.dump(workflows, f, indent=2, ensure_ascii=False)
print("✅ Categorizations applied and saved!")
# Show new distribution
new_category_counts = defaultdict(int)
new_uncategorized_count = 0
for workflow in workflows:
if workflow['category']:
new_category_counts[workflow['category']] += 1
else:
new_uncategorized_count += 1
print(f"\nNew category distribution:")
for category, count in sorted(new_category_counts.items()):
print(f" {category}: {count}")
print(f" Uncategorized: {new_uncategorized_count}")
else:
print("No changes applied.")
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff