n8n-workflows/fix_duplicate_workflows.py

#!/usr/bin/env python3
"""
Fix Duplicate Workflow Display Issue
Addresses Issue #99: UI displays duplicate entries for same workflows.
"""

import json
import os
from pathlib import Path
from typing import Dict, List, Set
import hashlib

def find_duplicate_workflows(workflows_dir: str = "workflows") -> Dict[str, List[Path]]:
    """Find duplicate workflow files based on content hash."""
    workflows_path = Path(workflows_dir)

    if not workflows_path.exists():
        print(f"Error: workflows directory not found at {workflows_path}")
        return {}

    # Dictionary to store hash -> list of file paths
    hash_to_files = {}

    # Process all JSON files
    json_files = list(workflows_path.rglob('*.json'))
    print(f"Analyzing {len(json_files)} workflow files for duplicates...")

    for file_path in json_files:
        try:
            # Read and normalize the JSON content
            with open(file_path, 'r', encoding='utf-8') as f:
                content = json.load(f)

            # Remove volatile fields that might differ between duplicates
            normalized = content.copy()
            normalized.pop('createdAt', None)
            normalized.pop('updatedAt', None)
            normalized.pop('id', None)  # Workflow ID might be different

            # Create hash of normalized content
            content_str = json.dumps(normalized, sort_keys=True)
            content_hash = hashlib.sha256(content_str.encode()).hexdigest()

            # Store file path by hash
            if content_hash not in hash_to_files:
                hash_to_files[content_hash] = []
            hash_to_files[content_hash].append(file_path)

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

    # Filter to only keep hashes with duplicates
    duplicates = {
        hash_val: files
        for hash_val, files in hash_to_files.items()
        if len(files) > 1
    }

    return duplicates

def find_name_duplicates(workflows_dir: str = "workflows") -> Dict[str, List[Path]]:
    """Find workflows with duplicate names (not necessarily same content)."""
    workflows_path = Path(workflows_dir)

    if not workflows_path.exists():
        return {}

    # Dictionary to store workflow name -> list of file paths
    name_to_files = {}

    json_files = list(workflows_path.rglob('*.json'))

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = json.load(f)

            workflow_name = content.get('name', 'Unnamed')

            if workflow_name not in name_to_files:
                name_to_files[workflow_name] = []
            name_to_files[workflow_name].append(file_path)

        except Exception as e:
            continue

    # Filter to only keep names with duplicates
    duplicates = {
        name: files
        for name, files in name_to_files.items()
        if len(files) > 1
    }

    return duplicates

def remove_exact_duplicates(duplicates: Dict[str, List[Path]], dry_run: bool = True) -> int:
    """Remove exact duplicate files, keeping only one copy."""
    removed_count = 0

    for content_hash, file_paths in duplicates.items():
        # Sort by path to ensure consistent ordering
        file_paths.sort()

        # Keep the first file, remove the rest
        to_keep = file_paths[0]
        to_remove = file_paths[1:]

        print(f"\nFound {len(file_paths)} identical workflows:")
        print(f"  Keeping: {to_keep.name}")
        for path in to_remove:
            print(f"  Removing: {path.name}")

            if not dry_run:
                try:
                    os.remove(path)
                    removed_count += 1
                    print(f"    ✅ Removed {path}")
                except Exception as e:
                    print(f"    ❌ Error removing {path}: {e}")

    return removed_count

def update_workflow_database():
    """Update the workflow database to remove duplicate entries."""
    try:
        import sys
        sys.path.append(str(Path(__file__).parent))
        from workflow_db import WorkflowDatabase

        # Re-index the database
        db = WorkflowDatabase()
        db.index_all_workflows(force_reindex=True)
        print("✅ Database re-indexed to remove duplicate entries")
        return True
    except Exception as e:
        print(f"Error updating database: {e}")
        return False

def fix_ui_duplicate_display():
    """Fix the UI to handle duplicate workflows properly."""

    # Update search_categories.json to remove duplicates
    categories_file = Path('context/search_categories.json')

    if categories_file.exists():
        with open(categories_file, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)

        # Remove duplicate entries based on filename
        seen_filenames = set()
        unique_data = []

        for item in categories_data:
            filename = item.get('filename')
            if filename and filename not in seen_filenames:
                seen_filenames.add(filename)
                unique_data.append(item)

        # Save deduplicated data
        with open(categories_file, 'w', encoding='utf-8') as f:
            json.dump(unique_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Removed {len(categories_data) - len(unique_data)} duplicate entries from search_categories.json")

    # Regenerate search index
    try:
        import subprocess
        result = subprocess.run(
            ['python3', 'scripts/generate_search_index.py'],
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            print("✅ Regenerated search index")
        else:
            print(f"Error regenerating search index: {result.stderr}")
    except Exception as e:
        print(f"Error regenerating search index: {e}")

def main():
    """Main function to fix duplicate workflow issues."""
    import argparse

    parser = argparse.ArgumentParser(description='Fix duplicate workflow display issues')
    parser.add_argument('--check', action='store_true', help='Only check for duplicates, do not fix')
    parser.add_argument('--fix-files', action='store_true', help='Remove duplicate files')
    parser.add_argument('--fix-ui', action='store_true', help='Fix UI duplicate display')
    parser.add_argument('--fix-all', action='store_true', help='Fix everything')

    args = parser.parse_args()

    print("🔍 Duplicate Workflow Fixer")
    print("=" * 60)

    # Find exact content duplicates
    print("\n📄 Checking for exact duplicate workflows...")
    exact_duplicates = find_duplicate_workflows()

    if exact_duplicates:
        print(f"\n⚠️  Found {len(exact_duplicates)} groups of duplicate workflows")
        total_duplicates = sum(len(files) - 1 for files in exact_duplicates.values())
        print(f"   Total duplicate files that can be removed: {total_duplicates}")

        if args.fix_files or args.fix_all:
            print("\n🗑️  Removing duplicate files...")
            removed = remove_exact_duplicates(exact_duplicates, dry_run=False)
            print(f"\n✅ Removed {removed} duplicate files")
    else:
        print("✅ No exact duplicate workflows found")

    # Find name duplicates (might be different content)
    print("\n📝 Checking for workflows with duplicate names...")
    name_duplicates = find_name_duplicates()

    if name_duplicates:
        print(f"\n⚠️  Found {len(name_duplicates)} workflow names used multiple times")
        for name, files in list(name_duplicates.items())[:5]:  # Show first 5
            print(f"   '{name}': {len(files)} files")
        if len(name_duplicates) > 5:
            print(f"   ... and {len(name_duplicates) - 5} more")
    else:
        print("✅ No duplicate workflow names found")

    # Fix UI display issues
    if args.fix_ui or args.fix_all:
        print("\n🖥️  Fixing UI duplicate display...")
        fix_ui_duplicate_display()
        update_workflow_database()
        print("✅ UI display fixes applied")

    if args.check:
        print("\n💡 Run with --fix-all to automatically fix all issues")

    print("\n✨ Duplicate check complete!")

if __name__ == "__main__":
    main()