Files
n8n-workflows/create_categories.py
e3d 56789e895e feat: Add GitHub Pages public search interface and enhanced documentation system
## 🌐 GitHub Pages Public Search Interface
- Complete client-side search application solving Issue #84
- Responsive HTML/CSS/JavaScript with mobile optimization
- Real-time search across 2,057+ workflows with instant results
- Category filtering across 15 workflow categories
- Dark/light theme support with system preference detection
- Direct workflow JSON download functionality

## 🤖 GitHub Actions Automation
- deploy-pages.yml: Automated deployment to GitHub Pages
- update-readme.yml: Weekly automated README statistics updates
- Comprehensive workflow indexing and category generation

## 🔍 Enhanced Search & Categorization
- Static search index generation for GitHub Pages
- Developer-chosen category prioritization system
- CalcsLive custom node integration and categorization
- Enhanced workflow database with better custom node detection
- Fixed README corruption with live database statistics

## 📚 Documentation & Infrastructure
- Comprehensive CHANGELOG.md with proper versioning
- Enhanced README with accurate statistics and public interface links
- Professional documentation solving repository infrastructure needs

## Technical Improvements
- Fixed Unicode encoding issues in Python scripts
- Enhanced CalcsLive detection with false positive prevention
- Improved JSON description preservation and indexing
- Mobile-optimized responsive design for all devices

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-29 21:54:12 -07:00

248 lines
9.7 KiB
Python

import json
import os
from pathlib import Path
import glob
import re
def load_def_categories():
"""Load the definition categories from def_categories.json"""
def_categories_path = Path("context/def_categories.json")
with open(def_categories_path, 'r', encoding='utf-8') as f:
raw_map = json.load(f)
# Normalize keys: strip non-alphanumerics and lowercase
integration_to_category = {
re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]
for item in raw_map
}
return integration_to_category
def extract_tokens_from_filename(filename):
"""Extract tokens from filename by splitting on '_' and removing '.json'"""
# Remove .json extension
name_without_ext = filename.replace('.json', '')
# Split by underscore
tokens = name_without_ext.split('_')
# Convert to lowercase for matching
tokens = [token.lower() for token in tokens if token]
return tokens
def find_matching_category(tokens, integration_to_category):
"""Find the first matching category for the given tokens"""
for token in tokens:
# Normalize token same as keys
norm = re.sub(r"[^a-z0-9]", "", token.lower())
if norm in integration_to_category:
return integration_to_category[norm]
# Try partial matches for common variations
for token in tokens:
norm = re.sub(r"[^a-z0-9]", "", token.lower())
for integration_key in integration_to_category:
if norm in integration_key or integration_key in norm:
return integration_to_category[integration_key]
return ""
def categorize_by_filename(filename):
"""
Categorize workflow based on filename patterns.
Returns the most likely category or None if uncertain.
"""
filename_lower = filename.lower()
# Security & Authentication
if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
return "Technical Infrastructure & DevOps"
# Data Processing & File Operations
if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile', 'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile', 'calcslive']):
return "Data Processing & Analysis"
# Utility & Business Process Automation
if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote', 'n8ntrainingcustomerdatastore', 'n8n']):
return "Business Process Automation"
# Webhook & API related
if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http', 'rssfeedread']):
return "Web Scraping & Data Extraction"
# Form & Data Collection
if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
return "Data Processing & Analysis"
# Local file operations
if any(word in filename_lower for word in ['localfile', 'filemaker']):
return "Cloud Storage & File Management"
# Database operations
if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
return "Data Processing & Analysis"
# AI & Machine Learning
if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus', 'googletranslate', 'summarize']):
return "AI Agent Development"
# E-commerce specific
if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
return "E-commerce & Retail"
# Social media specific
if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
return "Social Media Management"
# Customer support
if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
return "Communication & Messaging"
# Analytics & Tracking
if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
return "Data Processing & Analysis"
# Development tools
if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin', 'debughelper', 'bitbucket']):
return "Technical Infrastructure & DevOps"
# CRM & Sales tools
if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm']):
return "CRM & Sales"
# Marketing tools
if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark', 'mailgun']):
return "Marketing & Advertising Automation"
# Project management
if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo', 'calendly', 'jira']):
return "Project Management"
# Communication
if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams', 'gotowebinar']):
return "Communication & Messaging"
# Cloud storage
if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs']):
return "Cloud Storage & File Management"
# Creative tools
if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
return "Creative Design Automation"
# Video & content
if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
return "Creative Content & Video Automation"
# Financial tools
if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
return "Financial & Accounting"
# Weather & external APIs
if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
return "Web Scraping & Data Extraction"
return ""
def main():
# Load definition categories
integration_to_category = load_def_categories()
# Get all JSON files from workflows directory
workflows_dir = Path("workflows")
json_files = glob.glob(
os.path.join(workflows_dir, "**", "*.json"),
recursive=True
)
# Process each file
search_categories = []
for json_file in json_files:
path_obj = Path(json_file)
filename = path_obj.name
tokens = extract_tokens_from_filename(filename)
category = find_matching_category(tokens, integration_to_category)
search_categories.append({
"filename": filename,
"category": category
})
# Second pass for categorization
for item in search_categories:
if not item['category']:
item['category'] = categorize_by_filename(item['filename'])
# Sort by filename for consistency
search_categories.sort(key=lambda x: x['filename'])
# Write to search_categories.json
output_path = Path("context/search_categories.json")
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(search_categories, f, indent=2, ensure_ascii=False)
print(f"Generated search_categories.json with {len(search_categories)} entries")
# Generate unique categories list for API
unique_categories = set()
for item in search_categories:
if item['category']:
unique_categories.add(item['category'])
# Always include 'Uncategorized' for workflows without categories
unique_categories.add('Uncategorized')
# Sort categories alphabetically
categories_list = sorted(list(unique_categories))
# Write unique categories to a separate file for API consumption
categories_output_path = Path("context/unique_categories.json")
with open(categories_output_path, 'w', encoding='utf-8') as f:
json.dump(categories_list, f, indent=2, ensure_ascii=False)
print(f"Generated unique_categories.json with {len(categories_list)} categories")
# Print some statistics
categorized = sum(1 for item in search_categories if item['category'])
uncategorized = len(search_categories) - categorized
print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")
# Print detailed category statistics
print("\n" + "="*50)
print("CATEGORY DISTRIBUTION (Top 20)")
print("="*50)
# Count categories
category_counts = {}
for item in search_categories:
category = item['category'] if item['category'] else "Uncategorized"
category_counts[category] = category_counts.get(category, 0) + 1
# Sort by count (descending)
sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
# Display top 20
for i, (category, count) in enumerate(sorted_categories[:20], 1):
print(f"{i:2d}. {category:<40} {count:>4} files")
if len(sorted_categories) > 20:
remaining = len(sorted_categories) - 20
print(f"\n... and {remaining} more categories")
# Write tips on uncategorized workflows
print("\n" + "="*50)
print("Tips on uncategorized workflows")
print("="*50)
print("1. At the search, you'll be able to list all uncategorized workflows.")
print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")
print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")
# Done message
print("\n" + "="*50)
print("Done! Search re-indexed with categories.")
print("="*50)
if __name__ == "__main__":
main()