#!/usr/bin/env python3
"""
South Africa PHT Prospects - Domain Research & Cleanup
Processes 522+ companies: deduplication, standardization, domain enrichment
"""

import csv
import json
import os
import re
import time
import urllib.parse
import urllib.request
from collections import defaultdict
from typing import Dict, List, Optional

# Hunter.io API key.
# The HUNTER_API_KEY environment variable takes precedence so the credential
# can be supplied or rotated without editing this file; the literal is kept
# only as a backward-compatible fallback.
# NOTE(review): a key committed to source control should be treated as
# exposed - rotate it and drop the fallback once the environment is set up.
HUNTER_API_KEY = os.environ.get("HUNTER_API_KEY") or "fda8536970076bc3228c5b5fa6e19fdc407c43c9"

def clean_ca_rooms(value: str) -> int:
    """Parse a CA-room count out of a free-form cell value.

    Falsy values and the placeholders 'N/A' / 'n/a' yield 0. Otherwise
    '+' signs and spaces are removed and the first run of digits in what
    remains is returned as an int (0 when there are no digits).
    """
    if not value or value in ('', 'N/A', 'n/a'):
        return 0

    # Dropping spaces first lets "1 200" read as 1200 rather than 1.
    compact = str(value).replace('+', '').replace(' ', '').strip()

    first_number = re.search(r'\d+', compact)
    return int(first_number.group()) if first_number else 0

# Pome-fruit keywords. The lookbehind/lookahead keep "pineapple" and
# "pomegranate" from being mistaken for apples / pome fruit.
_POME_PATTERN = re.compile(r'(?<!pine)apple|pear|pome(?!granate)|deciduous')


def standardize_fruit_type(value: str) -> str:
    """Map a free-form fruit description onto a standard category.

    Args:
        value: Raw cell text, e.g. "Apples / Pears", "Citrus", "N/A".

    Returns:
        One of 'apple/pear', 'citrus', 'banana', 'mixed' or 'other'.
        Empty, whitespace-only and "n/a" values (any case) return 'mixed'.
    """
    if not value:
        return 'mixed'

    # Normalise before the placeholder check so " N/A " or "N/a" are
    # recognised as missing rather than falling through to 'other'.
    value = str(value).lower().strip()
    if value in ('', 'n/a'):
        return 'mixed'

    if _POME_PATTERN.search(value):
        # A '/' separates listed fruits. It only signals a mixed operation
        # when at least one listed item is not a pome fruit, so that
        # "apple/pear" itself stays in the 'apple/pear' category.
        listed = [part for part in value.split('/') if part.strip()]
        lists_non_pome = any(not _POME_PATTERN.search(part) for part in listed)
        if lists_non_pome or any(key in value for key in ('citrus', 'mixed', 'stone')):
            return 'mixed'
        return 'apple/pear'

    if 'citrus' in value:
        if any(key in value for key in ('/', 'mixed', 'stone', 'subtropical')):
            return 'mixed'
        return 'citrus'

    if 'banana' in value:
        return 'banana'

    if 'mixed' in value or 'multi' in value:
        return 'mixed'

    # Grapes, berries, avocado, mango and anything unrecognised.
    return 'other'

def clean_domain(domain: str) -> str:
    """Normalise a URL or domain string down to a bare lowercase host.

    Args:
        domain: Raw value such as "https://www.Example.com/about?x=1".

    Returns:
        The host with scheme, leading "www.", path, query string and
        fragment removed, or '' for missing values (empty, whitespace-only
        or "n/a" in any case).
    """
    if not domain:
        return ''

    # Normalise first so padded or differently-cased placeholders such as
    # " N/A " are treated as missing instead of being parsed as a host.
    domain = str(domain).strip().lower()
    if domain in ('', 'n/a'):
        return ''

    # Remove http(s):// and then a leading www.
    domain = re.sub(r'^https?://', '', domain)
    domain = re.sub(r'^www\.', '', domain)

    # Keep only the host: cut at the first path, query or fragment delimiter.
    return re.split(r'[/?#]', domain, maxsplit=1)[0]

def search_hunter_domain(company_name: str) -> Optional[str]:
    """Look up a company's domain through the Hunter.io domain-search endpoint.

    Returns the cleaned domain from the response's ``data.domain`` field, or
    None when the response carries no domain or any error occurs (the error
    is printed, not raised).
    """
    try:
        encoded_name = urllib.parse.quote(company_name)
        url = f"https://api.hunter.io/v2/domain-search?company={encoded_name}&api_key={HUNTER_API_KEY}"

        with urllib.request.urlopen(url, timeout=10) as response:
            payload = json.loads(response.read().decode())

        result = payload.get('data')
        if result and result.get('domain'):
            return clean_domain(result['domain'])
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to None.
        print(f"  Hunter.io error for {company_name}: {e}")

    return None

def search_brave(company_name: str) -> Optional[str]:
    """Placeholder for a Brave Search based domain lookup.

    No Brave Search API call is implemented yet, so this always returns
    None regardless of ``company_name``.
    """
    # TODO: implement the Brave Search API call.
    return None

def verify_domain(domain: str) -> bool:
    """Check whether a domain answers an HTTP(S) GET with status 200.

    Plain HTTP is tried first, then HTTPS. Each attempt uses a 10 second
    timeout and a browser-like User-Agent header.

    Args:
        domain: Bare host name (no scheme), e.g. "example.com".

    Returns:
        True as soon as one scheme responds with status 200; False for an
        empty domain or when both attempts fail or return another status.
    """
    if not domain:
        return False

    for scheme in ('http', 'https'):
        try:
            req = urllib.request.Request(
                f"{scheme}://{domain}",
                headers={'User-Agent': 'Mozilla/5.0'},
            )
            with urllib.request.urlopen(req, timeout=10) as response:
                if response.status == 200:
                    return True
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still abort a long batch run.
            continue

    return False

def find_domain_waterfall(company_name: str, delay: float = 0.5) -> Optional[str]:
    """Find a verified domain for a company using a waterfall of sources.

    Currently only the Hunter.io step is active; the Brave Search step is
    not implemented yet.

    Args:
        company_name: Company name to search for.
        delay: Seconds to pause after the Hunter.io call to stay under its
            rate limit.

    Returns:
        The domain when Hunter.io returns one and it passes verification,
        otherwise None.
    """
    print(f"  Searching for: {company_name}")

    # Step 1: Hunter.io
    domain = search_hunter_domain(company_name)

    # Throttle after every API call. Sleeping only on the not-found path
    # would leave successful lookups unthrottled.
    time.sleep(delay)

    if domain:
        print(f"    ✓ Hunter.io found: {domain}")
        if verify_domain(domain):
            print(f"    ✓ Verified: {domain}")
            return domain
        print(f"    ✗ Not accessible: {domain}")

    # Step 2: Brave Search
    # TODO: call search_brave(company_name) here once it is implemented.

    return None

def deduplicate_companies(companies: List[Dict]) -> List[Dict]:
    """Return the companies with repeated names or domains dropped.

    The first row seen for a given name (case-insensitive, stripped) or
    cleaned domain is kept; later rows matching either are skipped with a
    printed notice. Rows without a domain are only compared by name.
    """
    names_seen = set()
    domains_seen = set()
    kept = []

    for entry in companies:
        normalized_name = entry['Name'].lower().strip()
        normalized_domain = clean_domain(entry.get('Domain', ''))

        if normalized_name in names_seen:
            print(f"Skipping duplicate name: {entry['Name']}")
        elif normalized_domain and normalized_domain in domains_seen:
            print(f"Skipping duplicate domain: {entry['Name']} ({normalized_domain})")
        else:
            # First occurrence: remember its keys and keep the row.
            names_seen.add(normalized_name)
            if normalized_domain:
                domains_seen.add(normalized_domain)
            kept.append(entry)

    return kept

def _percentage(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def process_file(input_path: str, output_path: str):
    """Clean, deduplicate, enrich, rank and export the prospects CSV.

    Steps: read the input CSV, normalise the domain / fruit type / CA room
    columns, remove duplicates, look up missing domains, sort by CA rooms
    (descending), add a 1-based rank, write the selected columns to the
    output CSV and print summary statistics.

    Args:
        input_path: Path of the source CSV (UTF-8, with a header row).
        output_path: Path the cleaned and ranked CSV is written to.
    """
    print("=" * 80)
    print("SOUTH AFRICA PHT PROSPECTS - DOMAIN RESEARCH & CLEANUP")
    print("=" * 80)

    # Read input CSV
    print("\n1. Reading input file...")
    # 'utf-8-sig' skips a leading BOM if present, which would otherwise be
    # glued onto the first header name; newline='' is what the csv module
    # expects for files it parses.
    with open(input_path, 'r', encoding='utf-8-sig', newline='') as f:
        companies = list(csv.DictReader(f))

    print(f"   Loaded {len(companies)} companies")

    # Clean and standardize
    print("\n2. Cleaning and standardizing data...")
    # Source column headers, copied verbatim (including their spacing).
    fruit_col = '1. Fruit Type      (apple, pear, banana or citrus)'
    ca_rooms_col = '4. CA Rooms'
    for company in companies:
        company['Domain'] = clean_domain(company.get('Domain', ''))
        company['Fruit Type'] = standardize_fruit_type(company.get(fruit_col, ''))
        company['CA Rooms'] = clean_ca_rooms(company.get(ca_rooms_col, ''))

    # Deduplicate
    print("\n3. Removing duplicates...")
    companies = deduplicate_companies(companies)
    print(f"   {len(companies)} unique companies remaining")

    # Research missing domains
    print("\n4. Researching missing domains...")
    missing_domains = [c for c in companies if not c['Domain']]
    print(f"   {len(missing_domains)} companies missing domains")

    enriched_count = 0
    for i, company in enumerate(missing_domains, 1):
        domain = find_domain_waterfall(company['Name'])
        if domain:
            company['Domain'] = domain
            enriched_count += 1

        # Report after the i-th lookup so the "processed" count is accurate.
        if i % 50 == 0:
            print(f"\n   Progress: {i}/{len(missing_domains)} companies processed, {enriched_count} domains found")

    # _percentage avoids a ZeroDivisionError when nothing was missing.
    coverage = _percentage(enriched_count, len(missing_domains))
    print(f"\n   ✓ Found {enriched_count} new domains ({coverage:.1f}% coverage)")

    # Sort by CA rooms (descending)
    print("\n5. Sorting by CA room count...")
    companies.sort(key=lambda x: x['CA Rooms'], reverse=True)

    # Add ranking
    for i, company in enumerate(companies, 1):
        company['Rank'] = i

    # Write output CSV
    print("\n6. Writing output file...")
    output_columns = [
        'Rank',
        'Name',
        'Domain',
        'Phone',
        'Address',
        'City',
        'Country',
        'Fruit Type',
        'CA Rooms',
        '5. Revenue',
        '6. Hectares',
        'Atmos Score   (based off confidence they are large, have ca storage and storage apple, pears, banana\'s or citrus)',
        'Notes ',
        'Contact 1 Title',
        'Contact 1 Name',
        'Contact 1 Email',
        'Contact 1 Phone',
        'Contact 1 LinkedIn'
    ]

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        # extrasaction='ignore' drops input columns not listed above.
        writer = csv.DictWriter(f, fieldnames=output_columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(companies)

    print(f"   ✓ Wrote {len(companies)} companies to {output_path}")

    # Summary statistics
    with_domain = sum(1 for c in companies if c['Domain'])
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total companies: {len(companies)}")
    print(f"Companies with domains: {with_domain}")
    # Safe for an empty input file as well.
    print(f"Domain coverage: {_percentage(with_domain, len(companies)):.1f}%")
    print(f"\nFruit type breakdown:")
    fruit_types = defaultdict(int)
    for c in companies:
        fruit_types[c['Fruit Type']] += 1
    for fruit_type, count in sorted(fruit_types.items(), key=lambda x: x[1], reverse=True):
        print(f"  {fruit_type}: {count}")

    print(f"\nTop 10 companies by CA rooms:")
    for company in companies[:10]:
        domain_status = company['Domain'] or '(no domain)'
        print(f"  {company['Rank']}. {company['Name']} - {company['CA Rooms']} CA rooms - {domain_status}")

    print("\n✓ Processing complete!")

if __name__ == '__main__':
    import sys

    # Usage: script.py [INPUT_CSV [OUTPUT_CSV]]
    # Without arguments the original hard-coded locations are used, so
    # existing invocations behave exactly as before.
    default_input = '/Users/max/.openclaw/media/inbound/file_207---e2f6b0de-cf37-43e2-8a68-8e98114dbfe9.csv'
    default_output = '/Users/max/.openclaw/workspace/postharvest/south-africa-prospects-CLEANED-RANKED.csv'

    cli_args = sys.argv[1:]
    input_file = cli_args[0] if len(cli_args) >= 1 else default_input
    output_file = cli_args[1] if len(cli_args) >= 2 else default_output

    process_file(input_file, output_file)