#!/usr/bin/env python3
"""
South Africa PHT Prospects - Domain Research & Cleanup
Handles deduplication, domain research, and data standardization
"""

import csv
import json
import re
import requests
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse

# Configuration
# SECURITY NOTE(review): hard-coded API key committed to source — move it to an
# environment variable or secrets store and rotate the key.
HUNTER_API_KEY = "fda8536970076bc3228c5b5fa6e19fdc407c43c9"
# Absolute input/output paths; adjust when running outside this workspace.
INPUT_FILE = "/Users/max/.openclaw/media/inbound/file_207---e2f6b0de-cf37-43e2-8a68-8e98114dbfe9.csv"
OUTPUT_FILE = "/Users/max/.openclaw/workspace/postharvest/south-africa-prospects-CLEANED-RANKED.csv"
PROGRESS_FILE = "/Users/max/.openclaw/workspace/postharvest/progress.json"

# Fruit type standardization mapping
# Lowercase substrings seen in the raw "Fruit Type" column -> canonical label.
# Composite entries (e.g. 'grape/citrus') collapse to 'Mixed'.
FRUIT_TYPE_MAP = {
    'citrus': 'Citrus',
    'apple/pear': 'Apple/Pear',
    'apple': 'Apple/Pear',
    'pear': 'Apple/Pear',
    'mixed': 'Mixed',
    'multi-fruit': 'Mixed',
    'multi-product': 'Mixed',
    'stone fruit': 'Stone Fruit',
    'stone/pome fruit': 'Mixed',
    'stone/pome/citrus': 'Mixed',
    'deciduous': 'Deciduous',
    'grape': 'Table Grape',
    'table grape': 'Table Grape',
    'grape/citrus': 'Mixed',
    'avocado': 'Avocado',
    'berries': 'Berries',
    'other': 'Other',
    'subtropical': 'Subtropical',
    'citrus/subtropical': 'Mixed',
    'wine grape': 'Wine Grape'
}


class ProgressTracker:
    """Track processing progress and persist it to a JSON file.

    Counters survive restarts: the constructor reloads any previously
    saved state so a resumed run continues its totals.
    """

    def __init__(self, filename: str):
        self.filename = filename      # path of the JSON progress file
        self.processed = 0            # companies handled so far
        self.domains_found = 0        # companies that ended up with a domain
        self.load_progress()

    def load_progress(self):
        """Restore counters from disk; missing or unreadable files reset to zero.

        Robustness fix: the original only caught FileNotFoundError, so a
        corrupt or partially-written progress file crashed the whole run.
        """
        try:
            with open(self.filename, 'r') as f:
                data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError, OSError):
            # First run, truncated write, or unreadable file: start fresh.
            return
        self.processed = data.get('processed', 0)
        self.domains_found = data.get('domains_found', 0)

    def save_progress(self):
        """Write the current counters (plus a timestamp) to the progress file."""
        with open(self.filename, 'w') as f:
            json.dump({
                'processed': self.processed,
                'domains_found': self.domains_found,
                'timestamp': time.time()
            }, f, indent=2)

    def update(self, found_domain: bool = False):
        """Record one processed company; checkpoint and report every 50."""
        self.processed += 1
        if found_domain:
            self.domains_found += 1

        # Periodic checkpoint so a crash loses at most 50 entries of progress.
        if self.processed % 50 == 0:
            self.save_progress()
            print(f"\n{'='*60}")
            print(f"PROGRESS UPDATE: {self.processed} companies processed")
            print(f"Domains found: {self.domains_found} ({(self.domains_found/self.processed*100):.1f}%)")
            print(f"{'='*60}\n")


def clean_domain(domain: str) -> Optional[str]:
    """Normalize a raw domain/URL string to a bare lowercase hostname.

    Strips scheme, "www." prefix, path, query string, fragment, and port
    (the last two are generalizations over the original, which left
    ":8080" / "#anchor" in the result). Returns None for empty values,
    the literal "N/A", or anything too short to be a plausible domain.
    """
    if not domain:
        return None
    domain = domain.strip()
    if not domain or domain.upper() == 'N/A':
        return None

    domain = domain.lower()
    domain = re.sub(r'^https?://', '', domain)
    domain = re.sub(r'^www\.', '', domain)
    # Keep only the host portion: drop path, query, fragment and port.
    domain = domain.split('/')[0]
    domain = domain.split('?')[0]
    domain = domain.split('#')[0]
    domain = domain.split(':')[0]

    # Minimal sanity check: a real domain has a dot and a few characters.
    if '.' not in domain or len(domain) < 4:
        return None

    return domain


def extract_ca_rooms(ca_rooms_str: str) -> int:
    """Parse a CA room count out of free-form text like "10+" or "14 rooms".

    Empty strings and "N/A" (any case) yield 0; otherwise the first run
    of digits found in the value is returned, or 0 when none exists.
    """
    if not ca_rooms_str or ca_rooms_str.strip() == '':
        return 0

    normalized = str(ca_rooms_str).strip().upper()
    if normalized in ('N/A', ''):
        return 0

    # Pull the first group of digits (handles suffixes like "10+").
    digits = re.search(r'(\d+)', normalized)
    return int(digits.group(1)) if digits else 0


def standardize_fruit_type(fruit_type: str) -> str:
    """Map a free-form fruit-type string onto a standard category.

    Bug fix: the original scanned FRUIT_TYPE_MAP in insertion order, so
    short keys shadowed specific ones — "wine grape" matched 'grape' and
    became 'Table Grape', and "grape/citrus" / "citrus/subtropical"
    matched 'citrus' instead of collapsing to 'Mixed'. We now try an
    exact match first, then substring matches with the longest keys
    checked first. Empty input yields 'Unknown'; unrecognized non-empty
    input falls back to 'Mixed'.
    """
    if not fruit_type:
        return 'Unknown'

    fruit_type_lower = fruit_type.lower().strip()

    # Exact match wins outright.
    exact = FRUIT_TYPE_MAP.get(fruit_type_lower)
    if exact is not None:
        return exact

    # Substring match, most specific (longest) keys first so composites
    # like 'citrus/subtropical' beat their components ('citrus').
    for key in sorted(FRUIT_TYPE_MAP, key=len, reverse=True):
        if key in fruit_type_lower:
            return FRUIT_TYPE_MAP[key]

    return 'Mixed'


def search_domain_hunter(company_name: str) -> Optional[str]:
    """Look up a company's domain via the Hunter.io domain-search API.

    Returns the cleaned domain on success, or None on an API miss,
    non-200 response, or any network error (logged, never raised).

    Bug fix: the rate-limit sleep previously ran only on the miss path —
    successful lookups returned before it, so bursts of hits bypassed
    rate limiting entirely. The sleep now runs after every request.
    """
    try:
        url = "https://api.hunter.io/v2/domain-search"
        params = {
            'company': company_name,
            'api_key': HUNTER_API_KEY
        }

        response = requests.get(url, params=params, timeout=10)

        if response.status_code == 200:
            data = response.json()
            if data.get('data') and data['data'].get('domain'):
                domain = data['data']['domain']
                print(f"  ✓ Hunter.io found: {domain}")
                return clean_domain(domain)

        return None

    except Exception as e:
        # Best-effort lookup: log and move on rather than aborting the run.
        print(f"  ✗ Hunter.io error: {e}")
        return None
    finally:
        time.sleep(0.5)  # Rate limiting on every request, hit or miss


def verify_domain(domain: str) -> bool:
    """Check whether a domain responds over HTTPS (preferred) or HTTP.

    Any status code below 500 counts as "accessible" — a 4xx still
    proves the host exists and serves HTTP. Returns False on connection
    errors, timeouts, or invalid URLs.
    """
    try:
        # Try HTTPS first; HEAD keeps the probe lightweight.
        response = requests.head(f"https://{domain}", timeout=5, allow_redirects=True)
        if response.status_code < 500:
            return True

        # Fallback to plain HTTP for hosts without TLS.
        response = requests.head(f"http://{domain}", timeout=5, allow_redirects=True)
        return response.status_code < 500

    except requests.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and masked programming errors.
        return False


def read_and_clean_csv(filename: str) -> List[Dict]:
    """Load the prospect CSV, normalize fields, and drop duplicates.

    Duplicates are detected by case-insensitive company name and by
    cleaned domain; the first occurrence of each wins.
    """
    companies = []
    names_seen = set()
    domains_seen = set()

    with open(filename, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            name = row['Name'].strip()
            domain = clean_domain(row.get('Domain', ''))

            # First occurrence wins; later rows with the same name or
            # same domain are discarded.
            if name.lower() in names_seen:
                print(f"Skipping duplicate: {name}")
                continue
            if domain and domain in domains_seen:
                print(f"Skipping duplicate domain: {domain} ({name})")
                continue

            def field(key):
                # Fetch a column (tolerating its absence) and trim whitespace.
                return row.get(key, '').strip()

            companies.append({
                'Name': name,
                'Domain': domain,
                'Phone': field('Phone'),
                'Address': field('Address'),
                'City': field('City'),
                'Country': 'South Africa',
                'Fruit Type': standardize_fruit_type(row.get('1. Fruit Type      (apple, pear, banana or citrus)', '')),
                'CA Rooms': extract_ca_rooms(row.get('4. CA Rooms', '0')),
                'Revenue': field('5. Revenue'),
                'Hectares': field('6. Hectares'),
                'Atmos Score': field('Atmos Score   (based off confidence they are large, have ca storage and storage apple, pears, banana\'s or citrus)'),
                'Notes': field('Notes '),
                'Contact 1 Title': field('Contact 1 Title'),
                'Contact 1 Name': field('Contact 1 Name'),
                'Contact 1 Email': field('Contact 1 Email'),
                'Contact 1 Phone': field('Contact 1 Phone'),
                'Contact 1 LinkedIn': field('Contact 1 LinkedIn'),
            })
            names_seen.add(name.lower())
            if domain:
                domains_seen.add(domain)

    print(f"\n✓ Loaded {len(companies)} unique companies after initial deduplication")
    return companies


def research_missing_domains(companies: List[Dict], tracker: ProgressTracker) -> List[Dict]:
    """Fill in missing domains via Hunter.io, updating the tracker as we go.

    Companies that already carry a domain are counted and skipped;
    candidates returned by Hunter.io are only accepted after an HTTP
    reachability check. Mutates and returns the same list.
    """
    total = len(companies)

    for idx, company in enumerate(companies, 1):
        name = company['Name']

        if company['Domain']:
            # Nothing to research — record it as a found domain.
            tracker.update(found_domain=True)
            print(f"[{idx}/{total}] {name}: Already has domain {company['Domain']}")
            continue

        print(f"\n[{idx}/{total}] Researching: {name}")

        candidate = search_domain_hunter(name)
        if candidate and verify_domain(candidate):
            company['Domain'] = candidate
            tracker.update(found_domain=True)
            print(f"  ✓ Found and verified: {candidate}")
            continue

        # If Hunter fails, we'll use web_search via OpenClaw CLI.
        # For now, mark as not found.
        tracker.update(found_domain=False)
        print(f"  ✗ No domain found via Hunter.io")

        # Pause between lookups to stay polite to the API.
        time.sleep(1)

    return companies


def rank_companies(companies: List[Dict]) -> List[Dict]:
    """Return companies ordered by CA room count, largest first.

    Ties break alphabetically by company name, and each returned record
    is stamped with a 1-based 'Rank' field.
    """
    ordered = sorted(companies, key=lambda c: (-c['CA Rooms'], c['Name']))
    for position, company in enumerate(ordered, start=1):
        company['Rank'] = position
    return ordered


def save_results(companies: List[Dict], filename: str):
    """Write the cleaned, ranked companies to a CSV file.

    Columns are emitted in a fixed order; each company dict must contain
    exactly these keys (csv.DictWriter raises ValueError on extras).
    """
    fieldnames = [
        'Rank', 'Name', 'Domain', 'Phone', 'Address', 'City', 'Country',
        'Fruit Type', 'CA Rooms', 'Revenue', 'Hectares', 'Atmos Score', 'Notes',
        'Contact 1 Title', 'Contact 1 Name', 'Contact 1 Email', 'Contact 1 Phone',
        'Contact 1 LinkedIn'
    ]

    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(companies)

    # Bug fix: this message previously printed the literal "(unknown)"
    # instead of the actual output path.
    print(f"\n✓ Saved {len(companies)} companies to {filename}")


def main():
    """Run the full pipeline: load CSV, research domains, rank, save, report."""
    banner = "=" * 60
    print(banner)
    print("SOUTH AFRICA PHT PROSPECTS - DOMAIN RESEARCH & CLEANUP")
    print(banner)

    # Progress tracker persists counters between runs.
    tracker = ProgressTracker(PROGRESS_FILE)

    print("\nStep 1: Reading and cleaning CSV...")
    companies = read_and_clean_csv(INPUT_FILE)

    print("\nStep 2: Researching missing domains...")
    companies = research_missing_domains(companies, tracker)

    print("\nStep 3: Ranking companies by CA room count...")
    companies = rank_companies(companies)

    print("\nStep 4: Saving results...")
    save_results(companies, OUTPUT_FILE)

    # Summarize domain coverage for the run.
    total = len(companies)
    with_domains = sum(1 for c in companies if c['Domain'])
    coverage = (with_domains / total * 100) if total else 0

    print("\n" + banner)
    print("FINAL STATISTICS")
    print(banner)
    print(f"Total companies: {total}")
    print(f"Companies with domains: {with_domains} ({coverage:.1f}%)")
    print(f"Companies without domains: {total - with_domains}")
    print(f"\nOutput file: {OUTPUT_FILE}")
    print(banner)


# Entry point guard: allows importing this module without running the pipeline.
if __name__ == "__main__":
    main()
