#!/usr/bin/env python3
"""
South Africa PHT Prospects - Cleanup & Deduplication ONLY
Domain research will be done separately via Brave Search
"""

import csv
import re
from collections import defaultdict
from typing import Dict, List

def clean_ca_rooms(value: str) -> int:
    """Extract numeric CA rooms value from various formats"""
    if not value or value in ['', 'N/A', 'n/a']:
        return 0
    
    # Remove '+' and spaces
    value = str(value).replace('+', '').replace(' ', '').strip()
    
    # Extract first number
    match = re.search(r'\d+', value)
    if match:
        return int(match.group())
    return 0

def standardize_fruit_type(value: str) -> str:
    """Map a free-text fruit-type cell to one of the standard categories.

    Returns one of: 'apple/pear', 'citrus', 'banana', 'mixed', 'other'.
    Blank / N/A input defaults to 'mixed'.
    """
    if not value or value in ('', 'N/A', 'n/a'):
        return 'mixed'

    value = value.lower().strip()

    # Pome fruit, unless the cell also mentions other categories.
    if 'apple' in value or 'pear' in value or 'pome' in value or 'deciduous' in value:
        if any(tok in value for tok in ('citrus', '/', 'mixed', 'stone')):
            return 'mixed'
        return 'apple/pear'
    # Citrus, unless combined with other crops.
    if 'citrus' in value:
        if any(tok in value for tok in ('/', 'mixed', 'stone', 'subtropical')):
            return 'mixed'
        return 'citrus'
    if 'banana' in value:
        return 'banana'
    if 'mixed' in value or 'multi' in value:
        return 'mixed'
    # Everything else (grapes, berries, avocado, mango, unknown) is 'other'.
    # The original code special-cased those crops in a branch that returned
    # the same value as the fallback, so the branch was redundant.
    return 'other'

def clean_domain(domain: str) -> str:
    """Normalize a domain cell to a bare lowercase hostname.

    Strips scheme, leading 'www.', trailing slashes and any URL path.
    Blank / N/A placeholders normalize to the empty string.
    """
    if not domain or domain in ('', 'N/A', 'n/a'):
        return ''

    host = domain.strip().lower()
    host = re.sub(r'^https?://', '', host)   # drop scheme
    host = re.sub(r'^www\.', '', host)       # drop leading www.
    host = host.rstrip('/')                  # drop trailing slash(es)

    # Keep only the hostname when a path is attached; split is a no-op
    # when there is no '/' left.
    return host.split('/')[0]

def deduplicate_companies(companies: List[Dict]) -> List[Dict]:
    """Drop rows whose name (or non-empty domain) was already seen.

    Keeps the first occurrence; order of the survivors is preserved.
    Skipped rows are logged to stdout.
    """
    names_seen: set = set()
    domains_seen: set = set()
    survivors: List[Dict] = []

    for entry in companies:
        name_key = entry['Name'].lower().strip()
        domain_key = clean_domain(entry.get('Domain', ''))

        if name_key in names_seen:
            print(f"  Skipping duplicate name: {entry['Name']}")
            continue

        # A blank domain never counts as a duplicate.
        if domain_key and domain_key in domains_seen:
            print(f"  Skipping duplicate domain: {entry['Name']} ({domain_key})")
            continue

        names_seen.add(name_key)
        if domain_key:
            domains_seen.add(domain_key)
        survivors.append(entry)

    return survivors

def _write_missing_domains_report(path: str, missing: List[Dict]) -> None:
    """Write a plain-text worklist of companies that still need a domain."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("COMPANIES MISSING DOMAINS - FOR WEB SEARCH\n")
        f.write("=" * 80 + "\n\n")
        for company in missing:
            f.write(f"{company['Name']}\n")
            f.write(f"  City: {company.get('City', 'Unknown')}\n")
            f.write(f"  Fruit: {company['Fruit Type']}\n")
            f.write(f"  CA Rooms: {company['CA Rooms']}\n")
            f.write(f"  Search query: {company['Name']} South Africa website\n")
            f.write("\n")

def _print_summary(companies: List[Dict], missing_domains: List[Dict]) -> None:
    """Print summary statistics for the cleaned, ranked company list."""
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    with_domains = sum(1 for c in companies if c['Domain'])
    print(f"Total companies: {len(companies)}")
    print(f"Companies with domains: {with_domains}")
    print(f"Companies missing domains: {len(missing_domains)}")
    # Guard against an empty input file (the original divided by zero here).
    coverage = with_domains / len(companies) * 100 if companies else 0.0
    print(f"Current domain coverage: {coverage:.1f}%")

    print(f"\nFruit type breakdown:")
    fruit_types = defaultdict(int)
    for c in companies:
        fruit_types[c['Fruit Type']] += 1
    for fruit_type, count in sorted(fruit_types.items(), key=lambda x: x[1], reverse=True):
        print(f"  {fruit_type}: {count}")

    print(f"\nTop 10 companies by CA rooms:")
    for company in companies[:10]:
        domain_status = company['Domain'] or '(NEEDS RESEARCH)'
        print(f"  {company['Rank']}. {company['Name']} - {company['CA Rooms']} CA rooms - {domain_status}")

    for threshold in (10, 20, 30):
        hits = sum(1 for c in missing_domains if c['CA Rooms'] >= threshold)
        print(f"Companies missing domains ({threshold}+ CA rooms): {hits}")

def process_file(input_path: str, output_path: str, missing_domains_path: str):
    """Clean, deduplicate, rank and export the SA prospects CSV.

    Args:
        input_path: raw prospects CSV (expects the original survey column
            headers for fruit type and CA rooms).
        output_path: destination for the cleaned/ranked CSV.
        missing_domains_path: destination for the plain-text list of
            companies that still need domain research.

    Side effects: reads one CSV, writes two files, prints progress/summary.
    """
    print("=" * 80)
    print("SOUTH AFRICA PHT PROSPECTS - CLEANUP & DEDUPLICATION")
    print("=" * 80)

    # Read input CSV
    print("\n1. Reading input file...")
    with open(input_path, 'r', encoding='utf-8') as f:
        companies = list(csv.DictReader(f))

    print(f"   Loaded {len(companies)} companies")

    # Clean and standardize each row in place.
    print("\n2. Cleaning and standardizing data...")
    # Raw survey headers in the source CSV (spacing is significant).
    fruit_col = '1. Fruit Type      (apple, pear, banana or citrus)'
    ca_rooms_col = '4. CA Rooms'
    for company in companies:
        company['Domain'] = clean_domain(company.get('Domain', ''))
        company['Fruit Type'] = standardize_fruit_type(company.get(fruit_col, ''))
        company['CA Rooms'] = clean_ca_rooms(company.get(ca_rooms_col, ''))

    # Deduplicate by name and by non-empty domain.
    print("\n3. Removing duplicates...")
    companies = deduplicate_companies(companies)
    print(f"   ✓ {len(companies)} unique companies remaining")

    # Sort by CA rooms (descending) and assign 1-based ranks.
    print("\n4. Sorting by CA room count...")
    companies.sort(key=lambda x: x['CA Rooms'], reverse=True)
    for i, company in enumerate(companies, 1):
        company['Rank'] = i

    # Extract companies needing domain research.
    missing_domains = [c for c in companies if not c['Domain']]
    print(f"\n5. Extracting companies missing domains...")
    print(f"   {len(missing_domains)} companies need domain research")

    _write_missing_domains_report(missing_domains_path, missing_domains)
    print(f"   ✓ Wrote missing domains list to {missing_domains_path}")

    # Write output CSV; extra input columns are silently dropped.
    print("\n6. Writing output file...")
    output_columns = [
        'Rank',
        'Name',
        'Domain',
        'Phone',
        'Address',
        'City',
        'Country',
        'Fruit Type',
        'CA Rooms',
        '5. Revenue',
        '6. Hectares',
        'Atmos Score   (based off confidence they are large, have ca storage and storage apple, pears, banana\'s or citrus)',
        'Notes ',
        'Contact 1 Title',
        'Contact 1 Name',
        'Contact 1 Email',
        'Contact 1 Phone',
        'Contact 1 LinkedIn'
    ]

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=output_columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(companies)

    print(f"   ✓ Wrote {len(companies)} companies to {output_path}")

    _print_summary(companies, missing_domains)

    print("\n✓ Cleanup complete! Next step: Domain research via Brave Search")

if __name__ == '__main__':
    # Hard-coded paths for this one-off cleanup run.
    source_csv = '/Users/max/.openclaw/media/inbound/file_207---e2f6b0de-cf37-43e2-8a68-8e98114dbfe9.csv'
    cleaned_csv = '/Users/max/.openclaw/workspace/postharvest/south-africa-prospects-CLEANED-RANKED.csv'
    missing_list = '/Users/max/.openclaw/workspace/postharvest/sa-missing-domains.txt'

    process_file(source_csv, cleaned_csv, missing_list)
