#!/usr/bin/env python3

import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import json

def normalize_columns(df, target_columns):
    """Normalize a dataframe to the target column structure.

    Columns present in both ``df`` and ``target_columns`` are copied over;
    target columns missing from ``df`` are filled with empty strings.
    Columns of ``df`` not listed in ``target_columns`` are dropped.

    Args:
        df: Source DataFrame with arbitrary columns.
        target_columns: Ordered list of column names the result must have.

    Returns:
        DataFrame with exactly ``target_columns`` (in order) and the same
        index as ``df``.
    """
    # Build on df's index so copied columns align row-for-row.
    normalized = pd.DataFrame(index=df.index)

    for col in target_columns:
        if col in df.columns:
            normalized[col] = df[col]
        else:
            # BUG FIX: the original pre-created all target columns, so its
            # "fill missing" check never fired and absent columns stayed NaN.
            normalized[col] = ''

    return normalized[target_columns]

def calculate_data_score(row):
    """Count the populated cells in a row (used as deduplication priority).

    A cell counts when it is non-null and its string form is not blank
    after stripping whitespace.
    """
    return sum(
        1
        for cell in row.values
        if pd.notna(cell) and str(cell).strip() != ''
    )

def fuzzy_deduplicate(df, threshold=80):
    """Remove duplicate rows based on fuzzy company-name matching.

    Rows are ranked by how much data they contain (``calculate_data_score``)
    so that when two names match at or above ``threshold``, the row with
    less data is the one dropped.

    Args:
        df: DataFrame with a 'Company Name' column.
        threshold: ``fuzz.ratio`` score (0-100) at or above which two
            names are treated as duplicates.

    Returns:
        A new DataFrame with fuzzy duplicates removed and the index reset.
    """
    df = df.copy()
    df['data_score'] = df.apply(calculate_data_score, axis=1)
    df = df.sort_values('data_score', ascending=False).reset_index(drop=True)

    # PERF: hoist name normalization out of the O(n^2) comparison loop.
    # The original re-ran str(df.loc[i, ...]).strip().lower() via slow
    # scalar .loc access on every inner iteration.
    norm_names = [str(name).strip().lower() for name in df['Company Name']]
    raw_names = df['Company Name'].tolist()

    to_remove = set()

    for i in range(len(norm_names)):
        if i in to_remove:
            continue

        for j in range(i + 1, len(norm_names)):
            if j in to_remove:
                continue

            # Calculate fuzzy ratio
            similarity = fuzz.ratio(norm_names[i], norm_names[j])

            if similarity >= threshold:
                print(f"Found duplicate: '{raw_names[i]}' vs '{raw_names[j]}' (similarity: {similarity}%)")
                # j has the lower (or equal) data_score since rows are
                # sorted descending, so drop j and keep i.
                to_remove.add(j)

    # Remove duplicates
    df_clean = df.drop(index=to_remove).drop(columns=['data_score']).reset_index(drop=True)
    print(f"Removed {len(to_remove)} duplicates")

    return df_clean

# Define target columns for the merged master list.
target_columns = [
    'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
    'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
    'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
    'Notes', 'Source', 'Qualified'
]

print("Reading existing master file...")
master_df = pd.read_csv('/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv')
print(f"Master file: {len(master_df)} companies")

print("Reading new research file...")
research_df = pd.read_csv('/Users/max/.openclaw/workspace/postharvest/RESEARCH-FINAL-GAP.csv')
print(f"Research file: {len(research_df)} companies")

print("Normalizing column structures...")
master_normalized = normalize_columns(master_df, target_columns)
research_normalized = normalize_columns(research_df, target_columns)

print("Combining datasets...")
combined_df = pd.concat([master_normalized, research_normalized], ignore_index=True)
print(f"Combined total: {len(combined_df)} companies")

print("Performing fuzzy deduplication (80% threshold)...")
final_df = fuzzy_deduplicate(combined_df, threshold=80)

print("Sorting by Country, State/Region, Company Name...")
final_df = final_df.sort_values(['Country', 'State/Region', 'Company Name']).reset_index(drop=True)

print(f"Final count: {len(final_df)} companies")

print("Saving merged file...")
# NOTE: overwrites the input master file in place.
final_df.to_csv('/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv', index=False)

print("Creating Google Sheets JSON data...")
# Convert DataFrame to list of lists for Google Sheets; first row is the header.
sheets_data = [final_df.columns.tolist()]

# BUG-PRONE FIX: look up the Website column by name instead of hard-coding
# index 1, so a reordering of target_columns cannot silently break this.
website_idx = final_df.columns.get_loc('Website')

for _, row in final_df.iterrows():
    row_data = []
    for i, value in enumerate(row):
        if i == website_idx and pd.notna(value) and str(value).strip():
            # Turn a valid URL into a HYPERLINK formula showing just the domain.
            website = str(value).strip()
            if website.startswith(('http://', 'https://')):
                # Extract domain for display
                domain = website.replace('https://', '').replace('http://', '').split('/')[0]
                row_data.append(f'=HYPERLINK("{website}","{domain}")')
            else:
                row_data.append(website)
        else:
            # Convert to string and map NaN/None values to empty strings.
            if pd.isna(value):
                row_data.append('')
            else:
                row_data.append(str(value))
    sheets_data.append(row_data)

# Save JSON for Google Sheets
with open('/Users/max/.openclaw/workspace/postharvest/sheets_data.json', 'w') as f:
    json.dump(sheets_data, f)

print(f"\n=== FINAL RESULTS ===")
print(f"Companies processed: {len(combined_df)}")
print(f"Final unique companies: {len(final_df)}")
print(f"Goal achieved: {len(final_df) > 1000}")
print(f"Files saved:")
print(f"  - /Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv")
print(f"  - /Users/max/.openclaw/workspace/postharvest/sheets_data.json")