mirror of
https://github.com/Zie619/n8n-workflows.git
synced 2025-11-25 11:29:34 +08:00
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)
147 lines
5.1 KiB
Python
147 lines
5.1 KiB
Python
import json
|
|
import os
|
|
from pathlib import Path
|
|
import glob
|
|
import re
|
|
|
|
def load_def_categories(def_categories_path="context/def_categories.json"):
    """Load the integration -> category mapping from def_categories.json.

    Args:
        def_categories_path: Path to a JSON file containing a list of
            ``{"integration": ..., "category": ...}`` objects. Defaults to
            the repository's context/def_categories.json.

    Returns:
        dict: Maps a normalized integration name (lowercased, with all
        non-alphanumeric characters stripped) to its category string.
    """
    def_categories_path = Path(def_categories_path)
    with open(def_categories_path, 'r', encoding='utf-8') as f:
        raw_map = json.load(f)

    # Normalize keys: strip non-alphanumerics and lowercase, so filename
    # tokens can match definitions regardless of punctuation or casing.
    integration_to_category = {
        re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]
        for item in raw_map
    }
    return integration_to_category
|
|
|
def extract_tokens_from_filename(filename):
    """Split a workflow filename into lowercase tokens for category matching.

    Args:
        filename: Basename of a workflow file, e.g. "0042_Slack_Alert.json".

    Returns:
        list[str]: Non-empty, lowercased tokens from the filename.
    """
    # Strip only the trailing ".json" extension. (str.replace would also
    # delete any ".json" occurring in the middle of the name.)
    name_without_ext = filename
    if name_without_ext.endswith('.json'):
        name_without_ext = name_without_ext[:-len('.json')]

    # Split on underscores, dropping empty pieces from leading/trailing
    # or doubled underscores, and lowercase for matching.
    return [token.lower() for token in name_without_ext.split('_') if token]
|
|
|
def find_matching_category(tokens, integration_to_category):
    """Return the category for the first token matching a known integration.

    Matching is two-pass: an exact match on the normalized token first,
    then a looser substring match in either direction (e.g. "sheets"
    matching "googlesheets").

    Args:
        tokens: Iterable of filename tokens (any casing/punctuation).
        integration_to_category: Mapping of normalized integration name to
            category, as produced by load_def_categories().

    Returns:
        str: The matched category, or "" when nothing matches.
    """
    # Normalize once, up front, the same way the mapping keys were built.
    # Tokens that normalize to "" are dropped: an empty string is a
    # substring of every key and would spuriously match everything in the
    # partial-match pass below.
    normalized = []
    for token in tokens:
        norm = re.sub(r"[^a-z0-9]", "", token.lower())
        if norm:
            normalized.append(norm)

    # Pass 1: exact match on the normalized token.
    for norm in normalized:
        if norm in integration_to_category:
            return integration_to_category[norm]

    # Pass 2: partial matches for common variations.
    for norm in normalized:
        for integration_key in integration_to_category:
            if norm in integration_key or integration_key in norm:
                return integration_to_category[integration_key]

    return ""
|
|
|
def _collect_search_categories(workflows_dir, integration_to_category):
    """Recursively find workflow JSON files and assign each a category."""
    json_files = glob.glob(
        os.path.join(workflows_dir, "**", "*.json"),
        recursive=True
    )

    search_categories = []
    for json_file in json_files:
        filename = Path(json_file).name
        tokens = extract_tokens_from_filename(filename)
        category = find_matching_category(tokens, integration_to_category)
        search_categories.append({
            "filename": filename,
            "category": category
        })

    # Sort by filename so output is deterministic regardless of
    # filesystem traversal order.
    search_categories.sort(key=lambda x: x['filename'])
    return search_categories


def _write_json(path, data):
    """Write data to path as pretty-printed UTF-8 JSON (non-ASCII kept)."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _unique_categories(search_categories):
    """Return the sorted list of distinct categories for API consumption."""
    unique_categories = {item['category'] for item in search_categories if item['category']}
    # Always include 'Uncategorized' for workflows without categories.
    unique_categories.add('Uncategorized')
    return sorted(unique_categories)


def _print_distribution(search_categories):
    """Print the top-20 category distribution, counting blanks as 'Uncategorized'."""
    print("\n" + "="*50)
    print("CATEGORY DISTRIBUTION (Top 20)")
    print("="*50)

    # Count files per category.
    category_counts = {}
    for item in search_categories:
        category = item['category'] if item['category'] else "Uncategorized"
        category_counts[category] = category_counts.get(category, 0) + 1

    # Sort by count (descending) and display the top 20.
    sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
    for i, (category, count) in enumerate(sorted_categories[:20], 1):
        print(f"{i:2d}. {category:<40} {count:>4} files")

    if len(sorted_categories) > 20:
        remaining = len(sorted_categories) - 20
        print(f"\n... and {remaining} more categories")


def main():
    """Re-index workflow search categories.

    Reads context/def_categories.json, scans workflows/ recursively for
    JSON files, writes context/search_categories.json and
    context/unique_categories.json, and prints summary statistics plus
    contribution tips for uncategorized workflows.
    """
    # Load definition categories and categorize every workflow file.
    integration_to_category = load_def_categories()
    search_categories = _collect_search_categories(Path("workflows"), integration_to_category)

    # Write to search_categories.json.
    _write_json(Path("context/search_categories.json"), search_categories)
    print(f"Generated search_categories.json with {len(search_categories)} entries")

    # Write unique categories to a separate file for API consumption.
    categories_list = _unique_categories(search_categories)
    _write_json(Path("context/unique_categories.json"), categories_list)
    print(f"Generated unique_categories.json with {len(categories_list)} categories")

    # Print some statistics.
    categorized = sum(1 for item in search_categories if item['category'])
    uncategorized = len(search_categories) - categorized
    print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")

    _print_distribution(search_categories)

    # Tips on uncategorized workflows.
    print("\n" + "="*50)
    print("Tips on uncategorized workflows")
    print("="*50)
    print("1. At the search, you'll be able to list all uncategorized workflows.")
    print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")
    print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")

    # Done message.
    print("\n" + "="*50)
    print("Done! Search re-indexed with categories.")
    print("="*50)
|
if __name__ == "__main__":
|
|
main()
|