#!/usr/bin/env python3
"""
Merge research CSV files with fuzzy deduplication and Google Sheets update
"""

import pandas as pd
import re
from difflib import SequenceMatcher
from urllib.parse import urlparse
import json

def normalize_company_name(name):
    """Normalize a company name for fuzzy matching.

    Lowercases and trims the name, repeatedly strips common corporate and
    agricultural suffixes (Inc, LLC, Farms, & Sons, ...), drops a leading
    "the", removes punctuation, and collapses whitespace.

    Args:
        name: Raw company name; NaN/None yields "".

    Returns:
        The normalized name as a string (possibly "" when nothing remains).
    """
    if pd.isna(name):
        return ""

    name = str(name).lower().strip()

    # Suffixes that carry no identity information for matching purposes.
    suffixes = [
        r'\s+inc\.?$', r'\s+llc\.?$', r'\s+ltd\.?$', r'\s+corp\.?$',
        r'\s+company\.?$', r'\s+co\.?$', r'\s+limited\.?$',
        r'\s+corporation\.?$', r'\s+group\.?$', r'\s+farms?\.?$',
        r'\s+orchards?\.?$', r'\s+growers?\.?$', r'\s+&\s+sons?\.?$',
        r'\s+&\s+co\.?$', r'\s+brothers?\.?$', r'\s+bros\.?$'
    ]

    # Strip suffixes until the name stops changing: a single pass misses
    # stacked suffixes whose inner suffix appears earlier in the list than
    # the outer one (e.g. "smith co & sons" -> "smith co" -> "smith").
    previous = None
    while previous != name:
        previous = name
        for suffix in suffixes:
            name = re.sub(suffix, '', name)

    # Drop a leading "the"
    name = re.sub(r'^the\s+', '', name)

    # Remove punctuation, then collapse runs of whitespace.
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)

    return name.strip()

def similarity_score(name1, name2):
    """Return a 0..1 similarity ratio between two company names.

    Both names are normalized first; if either normalizes to an empty
    string the pair is considered incomparable and 0 is returned.
    """
    left = normalize_company_name(name1)
    right = normalize_company_name(name2)

    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0

def find_duplicates(companies, threshold=0.85):
    """Find likely-duplicate company records by name similarity.

    Args:
        companies: List of record dicts, each with a 'Company Name' key.
        threshold: Minimum similarity ratio (0..1) to count as a match.

    Returns:
        List of (i, j, score) tuples with i < j for every qualifying pair.
    """
    matches = []
    total = len(companies)

    for first in range(total):
        name = companies[first]['Company Name']
        for second in range(first + 1, total):
            ratio = similarity_score(name, companies[second]['Company Name'])
            if ratio >= threshold:
                matches.append((first, second, ratio))

    return matches

def choose_best_record(record1, record2):
    """Pick the more complete of two duplicate records.

    Completeness is the count of fields that are neither NaN nor blank;
    ties are broken by the longer 'Notes' text, preferring record1 when
    both measures are equal.
    """
    def populated(record):
        # A field counts only when it is neither NaN nor whitespace-blank.
        return sum(1 for value in record.values()
                   if pd.notna(value) and str(value).strip() != "")

    score1, score2 = populated(record1), populated(record2)
    if score1 != score2:
        return record1 if score1 > score2 else record2

    # Tie on field count: keep whichever record carries the longer notes.
    if len(str(record1.get('Notes', ''))) >= len(str(record2.get('Notes', ''))):
        return record1
    return record2

def format_website_for_sheets(website):
    """Format a website value as a Google Sheets HYPERLINK formula.

    Returns "" for missing, blank, or "N/A" values (case-insensitive).
    Otherwise prepends "https://" when no scheme is present and builds
    =HYPERLINK(url, domain) with the bare domain (sans "www.") as the
    display text. Falls back to the raw URL string when no domain can be
    extracted or the URL is malformed.
    """
    if pd.isna(website):
        return ""

    url = str(website).strip()
    if url == "" or url.upper() == "N/A":
        return ""

    # Add https:// if no protocol specified
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        display_name = urlparse(url).netloc.lower()
    except ValueError:
        # urlparse raises ValueError on malformed URLs (e.g. bad IPv6
        # brackets); keep the raw text rather than a broken formula.
        return url

    if display_name.startswith('www.'):
        display_name = display_name[4:]

    if not display_name:
        # No extractable domain: a HYPERLINK with a blank label is useless.
        return url

    return f'=HYPERLINK("{url}","{display_name}")'

def main():
    """Merge the research CSVs, deduplicate fuzzily, and stage Sheets data.

    Reads the four research CSV files, aligns them to a canonical column
    set, merges fuzzy-duplicate companies (keeping the more complete
    record), writes the result back over the master CSV, and dumps a 2-D
    JSON array (websites as HYPERLINK formulas) for a later Google Sheets
    upload.

    Returns:
        The number of unique companies in the final list.
    """
    print("Starting data merge process...")

    # Canonical column set and ordering shared by every input file.
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]

    csv_paths = [
        '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-WESTERN-COLDCHAIN.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-MIDWEST-DISTRIBUTORS.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-INTERNATIONAL-2.csv'
    ]

    frames = []
    for path in csv_paths:
        print(f"Reading {path.split('/')[-1]}...")
        frame = pd.read_csv(path)

        # Backfill any columns this file lacks, then impose canonical order.
        for column in target_columns:
            if column not in frame.columns:
                frame[column] = ""
        frames.append(frame[target_columns])
        print(f"  - Loaded {len(frame)} records")

    combined = pd.concat(frames, ignore_index=True)
    print(f"\nTotal records before deduplication: {len(combined)}")

    # Work on plain dicts for the pairwise merge pass.
    records = combined.to_dict('records')

    print("Finding duplicates...")
    pairs = find_duplicates(records, threshold=0.85)
    print(f"Found {len(pairs)} potential duplicate pairs")

    # Resolve highest-similarity pairs first; a record that has already
    # been dropped cannot participate in a later merge.
    dropped = set()
    for left, right, score in sorted(pairs, key=lambda pair: pair[2], reverse=True):
        if left in dropped or right in dropped:
            continue
        # Keep the more complete of the two, always under the left index.
        records[left] = choose_best_record(records[left], records[right])
        dropped.add(right)
        print(f"  Merged: '{records[left]['Company Name']}' (similarity: {score:.2f})")

    survivors = [rec for idx, rec in enumerate(records) if idx not in dropped]
    print(f"Records after deduplication: {len(survivors)}")

    final_df = (pd.DataFrame(survivors)
                .sort_values(['Country', 'State/Region', 'Company Name'], na_position='last')
                .reset_index(drop=True))

    output_path = '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv'
    final_df.to_csv(output_path, index=False)
    print(f"\nSaved merged data to {output_path}")

    print("Preparing data for Google Sheets...")

    # Render websites as clickable HYPERLINK formulas for Sheets.
    sheets_df = final_df.copy()
    sheets_df['Website'] = sheets_df['Website'].apply(format_website_for_sheets)

    # Header row + stringified data rows (NaN -> "") as one 2-D array.
    cleaned_rows = [
        [str(value) if pd.notna(value) else "" for value in row]
        for row in sheets_df.values.tolist()
    ]
    sheets_array = [list(sheets_df.columns)] + cleaned_rows

    # Persist the payload for the separate Google Sheets upload command.
    json_file = '/Users/max/.openclaw/workspace/postharvest/sheets_data.json'
    with open(json_file, 'w') as f:
        json.dump(sheets_array, f)

    print(f"Prepared {len(final_df)} records for Google Sheets")
    print(f"JSON data saved to {json_file}")

    return len(final_df)

if __name__ == "__main__":
    # Run the merge and report the resulting unique-company count.
    unique_total = main()
    print(f"\n✅ Merge complete! Final count: {unique_total} unique companies")