#!/usr/bin/env python3
import pandas as pd
import json
import re
from fuzzywuzzy import fuzz
from pathlib import Path

# Legal-form suffixes/prefixes to strip, pre-compiled once. Applied
# repeatedly (see below) so stacked forms like "Acme Inc Company" reduce
# fully regardless of the order the suffixes appear in.
_NAME_PATTERNS = [re.compile(p) for p in (
    r'\s+(inc\.?|incorporated)$',
    r'\s+(llc\.?)$',
    r'\s+(ltd\.?|limited)$',
    r'\s+(corp\.?|corporation)$',
    r'\s+(co\.?)$',
    r'\s+(company)$',
    r'\s+(group)$',
    r'\s+s\.a\.c\.i\.y?\s*a\.?$',
    r'\s+s\.a\.?$',
    r'\s+pty\.?\s+ltd\.?$',
    r'\s+gmbh\.?$',
    r'\s+b\.?v\.?$',
    r'^the\s+',
)]


def normalize_company_name(name):
    """Normalize a company name for fuzzy matching.

    Lowercases, strips common legal suffixes/prefixes ("Inc", "Ltd",
    "GmbH", leading "The", ...), converts '&' to 'and', and removes all
    punctuation and redundant whitespace.

    Args:
        name: Raw company name; may be NaN/None (pandas missing value).

    Returns:
        Normalized lowercase string; "" for missing input.
    """
    if pd.isna(name):
        return ""

    name_lower = str(name).strip().lower()

    # Strip suffix/prefix patterns until a full pass makes no change, so
    # multiple stacked suffixes are all removed (a single ordered pass —
    # the previous behavior — missed e.g. "X Inc Company" -> "x inc").
    # Terminates because every substitution strictly shortens the string.
    changed = True
    while changed:
        changed = False
        for pattern in _NAME_PATTERNS:
            stripped = pattern.sub('', name_lower)
            if stripped != name_lower:
                name_lower = stripped
                changed = True

    # Normalize spaces and punctuation.
    name_lower = re.sub(r'[&]', 'and', name_lower)
    name_lower = re.sub(r'[^\w\s]', '', name_lower)
    name_lower = re.sub(r'\s+', ' ', name_lower).strip()

    return name_lower

def find_duplicate(new_row, existing_df, threshold=80):
    """Return the index of an existing fuzzy-matched company, or None.

    Normalizes both names via normalize_company_name and compares them
    with fuzz.ratio; the first existing row scoring >= threshold wins.

    Args:
        new_row: Row (mapping) with a 'Company Name' entry.
        existing_df: DataFrame of already-accepted companies.
        threshold: Minimum fuzz.ratio (0-100) to count as a duplicate.

    Returns:
        The matching row's index, or None if no match / empty name.
    """
    target = normalize_company_name(new_row['Company Name'])
    if not target:
        return None

    for idx, candidate_row in existing_df.iterrows():
        candidate = normalize_company_name(candidate_row['Company Name'])
        # Skip rows whose name normalizes to empty; otherwise fuzzy-match.
        if candidate and fuzz.ratio(target, candidate) >= threshold:
            return idx

    return None

def count_non_empty_fields(row):
    """Return how many fields in *row* are neither missing nor blank.

    A field counts when it is not NaN/None and its string form contains
    non-whitespace characters. Used to decide which duplicate record
    carries more information.
    """
    return sum(1 for value in row if pd.notna(value) and str(value).strip())

def _load_aligned_csv(path, target_columns):
    """Read a CSV and conform it to target_columns.

    Columns missing from the file are added as empty strings; extras are
    dropped and the column order is normalized.
    """
    df = pd.read_csv(path)
    for col in target_columns:
        if col not in df.columns:
            df[col] = ''
    return df[target_columns]


def merge_companies(master_path='/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv',
                    research_paths=None,
                    output_path=None):
    """Merge research CSVs into the master company list with fuzzy dedup.

    For each research row, a fuzzy name match (threshold 80) against the
    accumulated list decides duplicate vs. new; on a duplicate, the record
    with more populated fields is kept. The result is sorted and written
    back out.

    Args:
        master_path: CSV holding the existing master list. Default keeps
            the original hard-coded location.
        research_paths: Iterable of (csv_path, label) pairs to merge in;
            defaults to the two RESEARCH-FINAL-PUSH files.
        output_path: Where to write the merged CSV; defaults to
            master_path (overwrites the master list, as before).

    Returns:
        The merged, sorted DataFrame.
    """
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]

    if research_paths is None:
        research_paths = [
            ('/Users/max/.openclaw/workspace/postharvest/RESEARCH-FINAL-PUSH-1.csv', 'Research 1'),
            ('/Users/max/.openclaw/workspace/postharvest/RESEARCH-FINAL-PUSH-2.csv', 'Research 2'),
        ]
    if output_path is None:
        output_path = master_path

    print("Reading files...")

    master_df = _load_aligned_csv(master_path, target_columns)
    print(f"Master file: {len(master_df)} records")

    research_dfs = []
    for path, label in research_paths:
        df = _load_aligned_csv(path, target_columns)
        research_dfs.append((df, label))
        print(f"{label}: {len(df)} records")

    total = len(master_df) + sum(len(df) for df, _ in research_dfs)
    print(f"Total records before deduplication: {total}")

    # Start with the master file as the base and fold each research file in.
    result_df = master_df.copy()

    for research_df, file_name in research_dfs:
        print(f"\nProcessing {file_name}...")
        added_count = 0
        duplicate_count = 0

        for _, new_row in research_df.iterrows():
            existing_idx = find_duplicate(new_row, result_df, threshold=80)

            if existing_idx is not None:
                # Duplicate found - keep whichever record has more data.
                existing_row = result_df.loc[existing_idx]
                if count_non_empty_fields(new_row) > count_non_empty_fields(existing_row):
                    result_df.loc[existing_idx] = new_row
                    print(f"  Replaced: {existing_row['Company Name']} -> {new_row['Company Name']}")
                else:
                    print(f"  Kept existing: {existing_row['Company Name']} (vs {new_row['Company Name']})")
                duplicate_count += 1
            else:
                # Append immediately (not batched) so later rows in the same
                # research file are deduplicated against this one too.
                result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)
                added_count += 1

        print(f"  Added: {added_count}, Duplicates handled: {duplicate_count}")

    # Case-insensitive sort by Country, State/Region, Company Name.
    print(f"\nSorting {len(result_df)} records...")
    result_df = result_df.sort_values(['Country', 'State/Region', 'Company Name'],
                                      key=lambda x: x.astype(str).str.lower())
    result_df = result_df.reset_index(drop=True)

    print(f"\nSaving to {output_path}...")
    result_df.to_csv(output_path, index=False)

    print(f"Final count: {len(result_df)} companies")

    return result_df

def prepare_for_google_sheets(df):
    """Convert the merged DataFrame into a list-of-lists for Sheets upload.

    The first row is the header. Website values starting with 'http' are
    rendered as =HYPERLINK(url, display) formulas, where the display text
    drops the scheme, a leading 'www.', and a trailing slash. Missing
    values become empty strings; everything else is stringified.

    Args:
        df: DataFrame containing at least the 15 expected columns.

    Returns:
        List of rows (lists of str), header first.
    """
    print("\nPreparing data for Google Sheets...")

    headers = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]
    sheets_data = [list(headers)]

    for _, row in df.iterrows():
        sheets_row = []
        for col in headers:
            value = row[col]

            if col == 'Website' and pd.notna(value) and str(value).strip():
                website = str(value).strip()
                if website.startswith('http'):
                    # Build a clean display name: no scheme, no leading
                    # 'www.', no trailing slash.
                    display_name = website.replace('https://', '').replace('http://', '').rstrip('/')
                    if display_name.startswith('www.'):
                        display_name = display_name[4:]
                    # Escape embedded double quotes by doubling them —
                    # otherwise a '"' in the URL breaks the formula.
                    url_arg = website.replace('"', '""')
                    display_arg = display_name.replace('"', '""')
                    sheets_row.append(f'=HYPERLINK("{url_arg}","{display_arg}")')
                else:
                    # Not a URL we can link; keep the raw text.
                    sheets_row.append(website)
            elif pd.isna(value):
                sheets_row.append('')
            else:
                sheets_row.append(str(value))

        sheets_data.append(sheets_row)

    return sheets_data

def main():
    """Merge the company lists, then emit the Sheets-ready JSON payload."""
    # Merge companies into the master list.
    merged = merge_companies()

    # Convert to Google-Sheets row format (HYPERLINK formulas etc.).
    sheets_rows = prepare_for_google_sheets(merged)

    # Persist as JSON for the gog upload command.
    json_path = '/Users/max/.openclaw/workspace/postharvest/sheets_data.json'
    with open(json_path, 'w') as handle:
        json.dump(sheets_rows, handle, indent=2)

    print(f"\nGoogle Sheets data prepared with {len(sheets_rows)} rows (including header)")
    print("Ready for Google Sheets upload!")


if __name__ == "__main__":
    main()