#!/usr/bin/env python3
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import json
import subprocess
import sys
import os

def normalize_columns(df, target_columns):
    """Return *df* restricted to *target_columns*, in that order.

    Columns missing from *df* are added, filled with the empty string.
    Unlike a per-column ``df[col] = ""`` loop, this does not mutate the
    caller's dataframe.

    Args:
        df: Source DataFrame.
        target_columns: Column names the result must contain, in order.

    Returns:
        A new DataFrame with exactly *target_columns* as its columns.
    """
    # reindex adds any missing columns (filled with "") and selects/orders
    # the requested ones in a single non-mutating step.
    return df.reindex(columns=target_columns, fill_value="")

def fuzzy_deduplicate(df, threshold=80):
    """Remove near-duplicate rows by fuzzy-matching the 'Company Name' column.

    For each row, the kept rows so far are scanned for the best
    ``fuzz.ratio`` score at or above *threshold*; when a duplicate is
    found, whichever record has more non-empty fields wins. Rows with an
    empty/NaN company name are dropped entirely.

    Note: the previous implementation stopped at the *first* record above
    the threshold, so its "best match" bookkeeping was dead code; this
    version genuinely picks the highest-scoring match.

    Args:
        df: Combined DataFrame containing a 'Company Name' column.
        threshold: Minimum fuzz.ratio score (0-100) to treat two names
            as duplicates.

    Returns:
        A copy of *df* containing only the kept rows (original index
        preserved).
    """
    print(f"Starting deduplication with {len(df)} records...")

    # Indices (into df) of the records we are keeping.
    keep_indices = []

    def _filled_field_count(series):
        # Number of non-empty, non-NaN cells in a record.
        return sum(1 for x in series if pd.notna(x) and str(x).strip() != '')

    for idx, row in df.iterrows():
        company_name = str(row['Company Name']).strip()

        # Drop records with no usable company name ('' or the string 'nan'
        # produced by str() on a NaN cell).
        if not company_name or company_name.lower() == 'nan':
            continue

        # Scan ALL kept records for the highest-scoring match at or above
        # the threshold (no early break — otherwise "best" is just "first").
        best_match_idx = None
        best_score = 0
        name_lower = company_name.lower()
        for prev_idx in keep_indices:
            prev_name = str(df.loc[prev_idx, 'Company Name']).strip()
            score = fuzz.ratio(name_lower, prev_name.lower())
            if score >= threshold and score > best_score:
                best_score = score
                best_match_idx = prev_idx

        if best_match_idx is not None:
            # Duplicate: keep whichever record carries more data.
            current_data_count = _filled_field_count(row)
            prev_data_count = _filled_field_count(df.loc[best_match_idx])

            print(f"Duplicate found: '{company_name}' vs '{df.loc[best_match_idx, 'Company Name']}' (score: {best_score})")

            if current_data_count > prev_data_count:
                # Replace the previously kept record with the richer one.
                keep_indices.remove(best_match_idx)
                keep_indices.append(idx)
                print(f"  Keeping new record (more data: {current_data_count} vs {prev_data_count})")
            else:
                print(f"  Keeping previous record (more data: {prev_data_count} vs {current_data_count})")
        else:
            keep_indices.append(idx)

    result_df = df.loc[keep_indices].copy()
    print(f"Deduplication complete: {len(df)} -> {len(result_df)} records (removed {len(df) - len(result_df)} duplicates)")

    return result_df

def create_hyperlink_formula(url, display_text):
    """Build a Google Sheets ``=HYPERLINK(...)`` formula for *url*.

    Blank/NaN URLs yield ''. A URL without a scheme gets 'https://'
    prepended. When *display_text* is the string 'auto', the display text
    is the URL's host with a leading 'www.' stripped; otherwise
    *display_text* is used as-is. Double quotes in the URL or display
    text are escaped (doubled) so the formula stays valid.

    Args:
        url: URL value from the dataframe (may be NaN or empty).
        display_text: Literal display text, or 'auto' to derive it.

    Returns:
        The formula string, '' for blank input, or the raw URL if it
        cannot be parsed.
    """
    from urllib.parse import urlparse

    if pd.isna(url) or str(url).strip() == '':
        return ''

    url_str = str(url).strip()
    # url_str is known non-empty here, so prepending is always safe.
    if not url_str.startswith(('http://', 'https://')):
        url_str = 'https://' + url_str

    try:
        parsed = urlparse(url_str)
    except ValueError:
        # urlparse rejects a few malformed inputs; fall back to raw text.
        return url_str

    # Strip only a *leading* 'www.' (replace() would also mangle hosts
    # that merely contain 'www.' somewhere in the middle).
    domain = parsed.netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    if not domain:
        domain = url_str

    display = domain if display_text == 'auto' else display_text
    # Sheets escapes a literal double quote inside a formula by doubling it.
    safe_url = url_str.replace('"', '""')
    safe_display = str(display).replace('"', '""')
    return f'=HYPERLINK("{safe_url}","{safe_display}")'

def _run_gog(args):
    """Run a ``gog`` CLI subcommand with output captured.

    Args:
        args: Argument list appended after the 'gog' executable.

    Returns:
        The CompletedProcess on success.

    Raises:
        subprocess.CalledProcessError: If gog exits non-zero.
    """
    return subprocess.run(['gog'] + args, check=True,
                          capture_output=True, text=True)


def main():
    """Merge new research into the master CSV, dedupe, sort, and push to Sheets.

    Pipeline: read master + new CSVs, normalize both to the target column
    set, concatenate, fuzzy-deduplicate, sort, overwrite the master CSV,
    then clear/update/format the Google Sheet via the gog CLI.
    """
    # Spreadsheet that receives the merged data (was repeated inline 3x).
    spreadsheet_id = '1uVd-xZFF4TEQGqtvw9z6W8fffeaifPCoLsek83GmEoQ'

    # Canonical column set and order for both inputs and the output.
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]

    # File paths
    master_file = '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv'
    new_file = '/Users/max/.openclaw/workspace/postharvest/RESEARCH-FINAL-100.csv'

    print("Reading files...")

    master_df = pd.read_csv(master_file)
    print(f"Master file: {len(master_df)} records")

    new_df = pd.read_csv(new_file)
    print(f"New file: {len(new_df)} records")

    # Normalize both files to the same column set/order before combining.
    master_df = normalize_columns(master_df, target_columns)
    new_df = normalize_columns(new_df, target_columns)

    combined_df = pd.concat([master_df, new_df], ignore_index=True)
    print(f"Combined: {len(combined_df)} records")

    deduped_df = fuzzy_deduplicate(combined_df, threshold=80)

    print("Sorting records...")
    deduped_df = deduped_df.sort_values(
        ['Country', 'State/Region', 'Company Name'], na_position='last'
    ).reset_index(drop=True)

    print(f"Saving {len(deduped_df)} records to master file...")
    deduped_df.to_csv(master_file, index=False)

    print("Preparing Google Sheets data...")

    # Copy for Sheets so the CSV keeps plain URLs while the sheet gets
    # clickable HYPERLINK formulas.
    sheets_df = deduped_df.copy()
    sheets_df['Website'] = sheets_df['Website'].apply(
        lambda x: create_hyperlink_formula(x, 'auto') if pd.notna(x) and str(x).strip() else ''
    )

    # Header row + stringified data rows, as the 2-D array gog expects.
    headers = list(sheets_df.columns)
    data_rows = sheets_df.fillna('').astype(str).values.tolist()
    all_data = [headers] + data_rows

    # Serialize once and pass directly; the old temp-file round trip was
    # written and immediately read back into the same command line.
    json_data = json.dumps(all_data)

    print("Updating Google Sheets...")

    # Clear the sheet; abort on failure since a partial update would be worse.
    try:
        _run_gog(['sheets', 'clear', spreadsheet_id, 'Sheet1'])
        print("Sheet cleared successfully")
    except subprocess.CalledProcessError as e:
        print(f"Error clearing sheet: {e}")
        print(f"Error output: {e.stderr}")
        return

    # Write the new data; USER_ENTERED makes Sheets evaluate the formulas.
    try:
        _run_gog([
            'sheets', 'update', spreadsheet_id, 'Sheet1!A1',
            '--values-json', json_data,
            '--input', 'USER_ENTERED'
        ])
        print("Sheet updated successfully")
    except subprocess.CalledProcessError as e:
        print(f"Error updating sheet: {e}")
        print(f"Error output: {e.stderr}")
        print(f"Command output: {e.stdout}")
        return

    # Format the header row; cosmetic, so failure is reported but non-fatal.
    try:
        _run_gog([
            'sheets', 'format', spreadsheet_id, 'Sheet1!A1:O1',
            '--format-json', '{"textFormat":{"bold":true,"foregroundColor":{"red":1,"green":1,"blue":1}},"backgroundColor":{"red":0.176,"green":0.416,"blue":0.306}}',
            '--format-fields', 'textFormat,backgroundColor'
        ])
        print("Header formatting applied successfully")
    except subprocess.CalledProcessError as e:
        print(f"Error formatting header: {e}")
        print(f"Error output: {e.stderr}")

    print(f"\n✅ MERGE COMPLETE!")
    print(f"📊 Final count: {len(deduped_df)} companies")
    print(f"🎯 Target achieved: {len(deduped_df)} > 1,000? {'✅ YES' if len(deduped_df) > 1000 else '❌ NO'}")
    print(f"📁 File saved: {master_file}")
    print(f"📊 Google Sheet updated: https://docs.google.com/spreadsheets/d/{spreadsheet_id}/")

# Run the merge pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()