#!/usr/bin/env python3
"""
Merge research CSV files with fuzzy deduplication and Google Sheets update
Using only standard library modules
"""

import csv
import re
import json
from difflib import SequenceMatcher
from urllib.parse import urlparse

def normalize_company_name(name):
    """Normalize a company name for fuzzy matching.

    Lowercases the name, repeatedly strips legal/industry suffixes
    (Inc, LLC, Farms, ...) and trailing punctuation until no more can be
    removed, drops a leading "the", then removes remaining punctuation
    and collapses whitespace.

    Args:
        name: Raw company name; any type (coerced via str), may be None.

    Returns:
        Normalized lowercase name, or "" for empty/blank input.
    """
    # Coerce before calling str methods so non-string truthy values
    # (e.g. numbers read from a CSV cell) don't raise AttributeError.
    if not name or str(name).strip() == "":
        return ""

    name = str(name).lower().strip()

    # Common legal/industry suffixes and family-business tails.
    suffixes = [
        r'\s+inc\.?$', r'\s+llc\.?$', r'\s+ltd\.?$', r'\s+corp\.?$',
        r'\s+company\.?$', r'\s+co\.?$', r'\s+limited\.?$',
        r'\s+corporation\.?$', r'\s+group\.?$', r'\s+farms?\.?$',
        r'\s+orchards?\.?$', r'\s+growers?\.?$', r'\s+&\s+sons?\.?$',
        r'\s+&\s+co\.?$', r'\s+brothers?\.?$', r'\s+bros\.?$'
    ]

    # Strip suffixes to a fixpoint. A single pass misses compound names:
    # "Acme Farms, Inc." sheds " inc.", then the trailing comma, and only
    # then does the "farms" suffix match on the next round.
    changed = True
    while changed:
        changed = False
        # Clear trailing punctuation so patterns anchored at $ can match.
        stripped = re.sub(r'[\s.,]+$', '', name)
        if stripped != name:
            name = stripped
            changed = True
        for suffix in suffixes:
            reduced = re.sub(suffix, '', name)
            if reduced != name:
                name = reduced
                changed = True

    # Remove "the" prefix
    name = re.sub(r'^the\s+', '', name)

    # Remove remaining punctuation and collapse whitespace
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)

    return name.strip()

def similarity_score(name1, name2):
    """Return a 0..1 fuzzy-match ratio between two company names.

    Names are normalized before comparison; if either normalizes to an
    empty string the score is 0.
    """
    left = normalize_company_name(name1)
    right = normalize_company_name(name2)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0

def count_filled_fields(record):
    """Count the non-empty field values in a record.

    Args:
        record: Mapping of column name -> cell value.

    Returns:
        Number of values that are truthy and contain non-whitespace text.
    """
    # str() guards against non-string cell values; the original also
    # compared the stripped string to "" — redundant with its truthiness.
    return sum(1 for value in record.values() if value and str(value).strip())

def choose_best_record(record1, record2):
    """Pick whichever of two duplicate records is more complete.

    Completeness is the number of filled fields; ties are broken by the
    longer 'Notes' text, with record1 winning an exact tie.
    """
    filled1 = count_filled_fields(record1)
    filled2 = count_filled_fields(record2)

    if filled1 != filled2:
        return record1 if filled1 > filled2 else record2

    # Equal completeness: prefer the record with more detailed notes.
    notes1 = str(record1.get('Notes', ''))
    notes2 = str(record2.get('Notes', ''))
    return record2 if len(notes2) > len(notes1) else record1

def format_website_for_sheets(website):
    """Format a website URL as a Google Sheets HYPERLINK formula.

    Args:
        website: Raw URL or bare domain; may be None, blank, or "N/A".

    Returns:
        '=HYPERLINK("url","domain")' with the domain (sans "www.") as the
        display text, "" for blank/N-A input, or the URL unchanged if it
        cannot be parsed.
    """
    if not website or str(website).strip() == "" or str(website).strip() == "N/A":
        return ""

    url = str(website).strip()

    # Default to https when no scheme is given.
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # Display name is the domain without a leading "www.".
    try:
        # urlparse raises ValueError for malformed URLs (e.g. bad ports);
        # the original bare `except:` swallowed every exception including
        # KeyboardInterrupt/SystemExit.
        parsed = urlparse(url)
    except ValueError:
        # Fall back to the raw URL rather than crashing the merge.
        return url

    display_name = parsed.netloc.lower()
    if display_name.startswith('www.'):
        display_name = display_name[4:]
    return f'=HYPERLINK("{url}","{display_name}")'

def read_csv_file(filename):
    """Load a CSV file and return its rows as header-keyed dictionaries."""
    with open(filename, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))

def write_csv_file(filename, records, fieldnames):
    """Write dict records to a CSV file, header row first."""
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            writer.writerow(record)

def main():
    """Merge the research CSVs, fuzzy-deduplicate by company name, write the
    master CSV, and dump a 2-D JSON array for a Google Sheets update.

    Returns:
        Number of unique records after deduplication.

    NOTE(review): file paths are hard-coded to one machine's workspace —
    consider making them arguments or relative paths.
    """
    print("Starting data merge process...")
    
    # Define target columns — this is both the canonical schema for every
    # input record and the final CSV/Sheets column order.
    target_columns = [
        'Company Name', 'Website', 'Country', 'State/Region', 'Primary Fruit',
        'CA Storage Confirmed', 'Employee Count', 'Est. Revenue ($M)',
        'Contact Name', 'Contact Title', 'Contact Email', 'Contact Phone',
        'Notes', 'Source', 'Qualified'
    ]
    
    # File paths. The first entry is also the OUTPUT path below, so the
    # existing master list is re-read and merged with the research files.
    files = [
        '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-WESTERN-COLDCHAIN.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-MIDWEST-DISTRIBUTORS.csv',
        '/Users/max/.openclaw/workspace/postharvest/RESEARCH-INTERNATIONAL-2.csv'
    ]
    
    all_records = []
    
    # Read all files
    for file_path in files:
        print(f"Reading {file_path.split('/')[-1]}...")
        records = read_csv_file(file_path)
        
        # Normalize each record to have all target columns; columns missing
        # from an input file become "", extra columns are dropped.
        normalized_records = []
        for record in records:
            normalized_record = {}
            for col in target_columns:
                normalized_record[col] = record.get(col, "")
            normalized_records.append(normalized_record)
        
        all_records.extend(normalized_records)
        print(f"  - Loaded {len(records)} records")
    
    print(f"\nTotal records before deduplication: {len(all_records)}")
    
    # Find duplicates with fuzzy matching. O(n^2) pairwise comparison —
    # acceptable for list sizes here, slow for very large inputs.
    print("Finding duplicates...")
    duplicates = []
    for i in range(len(all_records)):
        for j in range(i + 1, len(all_records)):
            score = similarity_score(all_records[i]['Company Name'], all_records[j]['Company Name'])
            if score >= 0.85:  # 85% similarity threshold
                duplicates.append((i, j, score))
    
    print(f"Found {len(duplicates)} potential duplicate pairs")
    
    # Resolve duplicates. Processed best-score first so the most confident
    # matches are merged before weaker ones; pairs where either side was
    # already removed are skipped (no transitive re-merge).
    to_remove = set()
    
    for i, j, score in sorted(duplicates, key=lambda x: x[2], reverse=True):
        if i not in to_remove and j not in to_remove:
            # Choose the better record
            best_record = choose_best_record(all_records[i], all_records[j])
            
            # Update the first index with the best record, mark the second for removal
            all_records[i] = best_record
            to_remove.add(j)
            
            print(f"  Merged: '{all_records[i]['Company Name']}' (similarity: {score:.2f})")
    
    # Remove duplicates
    final_records = [all_records[i] for i in range(len(all_records)) if i not in to_remove]
    
    print(f"Records after deduplication: {len(final_records)}")
    
    # Sort records by Country, then State/Region, then Company Name.
    def sort_key(record):
        country = record.get('Country', '').strip()
        state = record.get('State/Region', '').strip() 
        company = record.get('Company Name', '').strip()
        return (country, state, company)
    
    final_records.sort(key=sort_key)
    
    # Save to CSV — overwrites the master list that was read as input.
    output_path = '/Users/max/.openclaw/workspace/postharvest/MASTER-LIST-FINAL.csv'
    write_csv_file(output_path, final_records, target_columns)
    print(f"\nSaved merged data to {output_path}")
    
    # Prepare data for Google Sheets
    print("Preparing data for Google Sheets...")
    
    # Format for Google Sheets: only the Website column changes (becomes a
    # =HYPERLINK(...) formula); records are shallow-copied so final_records
    # written to CSV above are untouched.
    sheets_records = []
    for record in final_records:
        sheets_record = record.copy()
        sheets_record['Website'] = format_website_for_sheets(record['Website'])
        sheets_records.append(sheets_record)
    
    # Convert to 2D array (list of rows) for the Sheets values API.
    headers = target_columns
    data_rows = []
    
    for record in sheets_records:
        row = []
        for col in headers:
            value = record.get(col, "")
            # Handle empty values
            if value is None or value == "":
                row.append("")
            else:
                row.append(str(value))
        data_rows.append(row)
    
    # Combine headers and data
    sheets_array = [headers] + data_rows
    
    # Save the JSON for the Google Sheets command
    json_file = '/Users/max/.openclaw/workspace/postharvest/sheets_data.json'
    with open(json_file, 'w') as f:
        json.dump(sheets_array, f)
    
    print(f"Prepared {len(final_records)} records for Google Sheets")
    print(f"JSON data saved to {json_file}")
    
    return len(final_records)

# Script entry point: run the merge and report the final unique-company count.
if __name__ == "__main__":
    final_count = main()
    print(f"\n✅ Merge complete! Final count: {final_count} unique companies")