2025-06-29 03:08:41 -07:00
import json
import os
from pathlib import Path
def load_def_categories ( ) :
""" Load the definition categories from def_categories.json """
def_categories_path = Path ( " context/def_categories.json " )
with open ( def_categories_path , ' r ' , encoding = ' utf-8 ' ) as f :
categories_data = json . load ( f )
# Create a mapping from integration name (lowercase) to category
integration_to_category = { }
for item in categories_data :
integration = item [ ' integration ' ] . lower ( )
category = item [ ' category ' ]
integration_to_category [ integration ] = category
return integration_to_category
def extract_tokens_from_filename ( filename ) :
""" Extract tokens from filename by splitting on ' _ ' and removing ' .json ' """
# Remove .json extension
name_without_ext = filename . replace ( ' .json ' , ' ' )
# Split by underscore
tokens = name_without_ext . split ( ' _ ' )
# Convert to lowercase for matching
tokens = [ token . lower ( ) for token in tokens if token ]
return tokens
def find_matching_category ( tokens , integration_to_category ) :
""" Find the first matching category for the given tokens """
for token in tokens :
if token in integration_to_category :
return integration_to_category [ token ]
# Try partial matches for common variations
for token in tokens :
for integration in integration_to_category :
if token in integration or integration in token :
return integration_to_category [ integration ]
return " "
def main ( ) :
# Load definition categories
integration_to_category = load_def_categories ( )
# Get all JSON files from workflows directory
workflows_dir = Path ( " workflows " )
json_files = list ( workflows_dir . glob ( " *.json " ) )
# Process each file
search_categories = [ ]
for json_file in json_files :
filename = json_file . name
tokens = extract_tokens_from_filename ( filename )
category = find_matching_category ( tokens , integration_to_category )
search_categories . append ( {
" filename " : filename ,
" category " : category
} )
# Sort by filename for consistency
search_categories . sort ( key = lambda x : x [ ' filename ' ] )
# Write to search_categories.json
output_path = Path ( " context/search_categories.json " )
with open ( output_path , ' w ' , encoding = ' utf-8 ' ) as f :
json . dump ( search_categories , f , indent = 2 , ensure_ascii = False )
print ( f " Generated search_categories.json with { len ( search_categories ) } entries " )
2025-06-29 03:26:18 -07:00
# Generate unique categories list for API
unique_categories = set ( )
for item in search_categories :
if item [ ' category ' ] :
unique_categories . add ( item [ ' category ' ] )
# Always include 'Uncategorized' for workflows without categories
unique_categories . add ( ' Uncategorized ' )
# Sort categories alphabetically
categories_list = sorted ( list ( unique_categories ) )
# Write unique categories to a separate file for API consumption
categories_output_path = Path ( " context/unique_categories.json " )
with open ( categories_output_path , ' w ' , encoding = ' utf-8 ' ) as f :
json . dump ( categories_list , f , indent = 2 , ensure_ascii = False )
print ( f " Generated unique_categories.json with { len ( categories_list ) } categories " )
2025-06-29 03:08:41 -07:00
# Print some statistics
categorized = sum ( 1 for item in search_categories if item [ ' category ' ] )
uncategorized = len ( search_categories ) - categorized
print ( f " Categorized: { categorized } , Uncategorized: { uncategorized } " )
# Print detailed category statistics
print ( " \n " + " = " * 50 )
print ( " CATEGORY DISTRIBUTION (Top 20) " )
print ( " = " * 50 )
# Count categories
category_counts = { }
for item in search_categories :
category = item [ ' category ' ] if item [ ' category ' ] else " Uncategorized "
category_counts [ category ] = category_counts . get ( category , 0 ) + 1
# Sort by count (descending)
sorted_categories = sorted ( category_counts . items ( ) , key = lambda x : x [ 1 ] , reverse = True )
# Display top 20
for i , ( category , count ) in enumerate ( sorted_categories [ : 20 ] , 1 ) :
print ( f " { i : 2d } . { category : <40 } { count : >4 } files " )
if len ( sorted_categories ) > 20 :
remaining = len ( sorted_categories ) - 20
print ( f " \n ... and { remaining } more categories " )
# Write tips on uncategorized workflows
print ( " \n " + " = " * 50 )
print ( " Tips on uncategorized workflows " )
print ( " = " * 50 )
print ( " 1. At the search, you ' ll be able to list all uncategorized workflows. " )
print ( " 2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json. " )
print ( " 3. You can contribute to the category definitions and then make a pull request to help improve the search experience. " )
# Done message
print ( " \n " + " = " * 50 )
print ( " Done! Search re-indexed with categories. " )
print ( " = " * 50 )
if __name__ == " __main__ " :
main ( )