#!/usr/bin/env python3
"""
Merge research CSV files into master list with deduplication and normalization
"""

import csv
import re
from difflib import SequenceMatcher

def similarity(a, b):
    """Return the difflib similarity ratio (0.0-1.0) between two strings."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def normalize_company_name(name):
    """Normalize a company name for fuzzy duplicate matching.

    Lowercases, strips punctuation, collapses whitespace, then repeatedly
    removes common business suffixes ("Inc", "LLC", "Farms", ...) until the
    name stops changing.

    BUGFIX vs. the previous version: punctuation is now removed BEFORE the
    $-anchored suffix patterns run, so "Acme Farms, Inc." and "Acme Farms"
    both normalize to "acme" (previously the trailing ", Inc." blocked the
    "farms" pattern and the pair was not detected as a duplicate).  Suffix
    stripping also loops to a fixpoint so stacked suffixes are fully removed
    regardless of pattern order.

    Args:
        name: Raw company name; coerced with str().  Falsy input yields "".

    Returns:
        The normalized name (possibly empty string).
    """
    if not name:
        return ""
    name = str(name).lower().strip()

    # Strip punctuation first so trailing marks do not prevent the
    # end-anchored suffix patterns below from matching.
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()

    # Common business suffixes, anchored at end of string.  Each pattern
    # requires leading whitespace so a name that IS a suffix (e.g. "Farms")
    # is never reduced to the empty string.  Dots/ampersands are already
    # gone, so the alternation stays simple; input is lowercased, so no
    # IGNORECASE flag is needed.
    suffix_patterns = [
        r'\s+(inc|llc|ltd|corp|corporation|company|co|limited)$',
        r'\s+group$',
        r'\s+farms?$',
        r'\s+orchards?$',
        r'\s+growers?$',
    ]

    # Repeat until stable so stacked suffixes ("Sunrise Orchards LLC",
    # "X Inc Group") are stripped no matter which order they appear in.
    changed = True
    while changed:
        changed = False
        for pattern in suffix_patterns:
            stripped = re.sub(pattern, '', name)
            if stripped != name:
                name = stripped
                changed = True

    return name.strip()

def is_duplicate(name1, name2, threshold=0.85):
    """Return True when two company names likely refer to the same company.

    Both names are normalized first; an exact normalized match always counts
    as a duplicate, otherwise a fuzzy similarity ratio >= *threshold* decides.
    Empty (post-normalization) names never match anything.
    """
    norm1 = normalize_company_name(name1)
    norm2 = normalize_company_name(name2)

    # Nothing to compare — treat as distinct rather than guessing.
    if not (norm1 and norm2):
        return False

    return norm1 == norm2 or similarity(norm1, norm2) >= threshold

def read_and_normalize_csv(file_path, default_source):
    """Read a CSV file and normalize every row to the target schema.

    Columns missing from (or empty in) a row are filled with defaults:
      * 'Qualified' -> 'Yes' for the new Max-research sources, else ''
      * 'Source'    -> *default_source* when the row has no, or an empty,
                       Source value
      * everything else (including the contact fields) -> ''

    BUGFIX vs. the previous version: an empty-but-present 'Source' cell now
    falls back to *default_source* (``dict.get``'s default only covered the
    missing-key case, so empty cells stayed empty).

    Args:
        file_path: Path of the CSV file to read.
        default_source: Source label applied to rows lacking one.

    Returns:
        A list of dicts keyed exactly by the target columns; [] when the
        file cannot be read (best-effort merge semantics).
    """
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]
    # Sources that represent freshly researched (hence pre-qualified) data.
    researched_sources = {
        'Max Research - Eastern US',
        'Max Research - Citrus/Tropical',
        'Max Research - International',
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            rows = list(csv.DictReader(f))
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        # Best-effort: a missing or unreadable input must not abort the merge.
        print(f"Error reading {file_path}: {e}")
        return []

    print(f"Read {len(rows)} records from {file_path}")

    normalized_rows = []
    for row in rows:
        normalized_row = {}
        for col in target_columns:
            if row.get(col):
                # Row already has a non-empty value — keep it verbatim.
                normalized_row[col] = row[col]
            elif col == 'Qualified':
                # New research has been vetted already, so mark it qualified.
                normalized_row[col] = 'Yes' if default_source in researched_sources else ''
            elif col == 'Source':
                # Fall back to default_source for missing OR empty cells.
                normalized_row[col] = row.get('Source') or default_source
            else:
                normalized_row[col] = ''
        normalized_rows.append(normalized_row)

    return normalized_rows

def deduplicate_companies(all_rows):
    """Drop rows whose company name fuzzily matches an earlier row's name.

    First occurrence of each company wins; later fuzzy matches are discarded
    and reported on stdout.  Rows with an empty 'Company Name' are skipped
    entirely.  O(n^2) pairwise comparison — acceptable at this list size.
    """
    print(f"Starting deduplication with {len(all_rows)} companies")

    keep_rows = []
    seen_names = []

    for row in all_rows:
        name = row.get('Company Name', '').strip()
        if not name:
            continue

        # Find the first already-kept name this one collides with, if any.
        match = next((prev for prev in seen_names if is_duplicate(name, prev)), None)
        if match is not None:
            print(f"Found duplicate: '{name}' matches '{match}'")
            continue

        keep_rows.append(row)
        seen_names.append(name)

    removed = len(all_rows) - len(keep_rows)
    print(f"After deduplication: {len(keep_rows)} companies (removed {removed} duplicates)")

    return keep_rows

def sort_companies(rows):
    """Return rows ordered by Country, then State/Region, then Company Name.

    Missing or None field values sort as empty strings; the comparison is
    case-insensitive.  A new sorted list is returned; *rows* is untouched.
    """
    def _key(row):
        return tuple(
            (row.get(field) or '').lower()
            for field in ('Country', 'State/Region', 'Company Name')
        )

    return sorted(rows, key=_key)

def write_csv(rows, file_path):
    """Write normalized company rows to *file_path* as CSV.

    Columns follow the fixed target schema.  Robustness fix vs. the previous
    version: keys outside the schema are now silently ignored
    (``extrasaction='ignore'``) instead of raising ValueError, and missing
    keys are written as empty fields (``restval=''``), so a partially
    normalized row can no longer abort the whole export.

    Args:
        rows: Iterable of dicts keyed (at least partially) by the schema.
        file_path: Destination path; the file is overwritten.
    """
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]

    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f,
            fieldnames=target_columns,
            extrasaction='ignore',  # tolerate stray keys instead of raising
            restval='',             # blank out schema columns a row lacks
        )
        writer.writeheader()
        writer.writerows(rows)

def main():
    """Merge the research CSVs into the master list and print a summary.

    Reads the existing master plus three research files, normalizes each to
    the common schema, drops rows without a company name, deduplicates with
    fuzzy matching, sorts, and overwrites the master list in place.

    Returns:
        The final number of companies written.
    """
    # Input files paired with the Source label applied to unlabeled rows.
    # NOTE: the master list is both the first input and the output — it is
    # rewritten in place.
    sources = [
        ('/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv',
         'Existing Master'),
        ('/Users/max/.openclaw/workspace/postharvest/RESEARCH-EASTERN-US.csv',
         'Max Research - Eastern US'),
        ('/Users/max/.openclaw/workspace/postharvest/RESEARCH-CITRUS-TROPICAL.csv',
         'Max Research - Citrus/Tropical'),
        ('/Users/max/.openclaw/workspace/postharvest/RESEARCH-INTERNATIONAL.csv',
         'Max Research - International'),
    ]
    output_file = '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv'

    print("Reading and normalizing files...")
    all_rows = []
    for path, label in sources:
        all_rows.extend(read_and_normalize_csv(path, label))

    print("Combining all data...")
    print(f"Combined total: {len(all_rows)} records")

    # Rows without a company name are useless for dedup and output.
    all_rows = [r for r in all_rows if r.get('Company Name', '').strip()]
    print(f"After removing empty company names: {len(all_rows)} records")

    final_rows = deduplicate_companies(all_rows)

    print("Sorting data...")
    final_rows = sort_companies(final_rows)

    print(f"Saving {len(final_rows)} companies to {output_file}")
    write_csv(final_rows, output_file)

    print(f"\n=== MERGE COMPLETE ===")
    print(f"Final count: {len(final_rows)} companies")

    # Per-country tally for the summary.
    country_counts = {}
    for r in final_rows:
        country = r.get('Country', 'Unknown')
        country_counts[country] = country_counts.get(country, 0) + 1

    print(f"By country:")
    for country, count in sorted(country_counts.items()):
        print(f"  {country}: {count} companies")

    return len(final_rows)

if __name__ == "__main__":
    main()