#!/usr/bin/env python3
"""
PHT Discovery Engine
Finds new target companies for outreach

Created: March 11, 2026
"""

import csv
import json
import requests
from pathlib import Path
from datetime import datetime

# Filesystem layout
# WORKSPACE is the root folder that holds the source/master CSV lists;
# OUTPUT_DIR is where discovery results are written.
WORKSPACE = Path("/Users/max/.openclaw/workspace/postharvest")
OUTPUT_DIR = WORKSPACE.joinpath("automation", "data")
# Created eagerly (at import time) so later writes never hit a missing folder.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Profile of the companies worth targeting.
# NOTE(review): none of the discovery functions visible in this file read this
# mapping; it appears to be descriptive/configuration data for other tooling.
TARGET_CRITERIA = {
    "min_rooms": 10,
    "fruit_types": [
        "apples",
        "citrus",
        "pears",
        "kiwis",
        "bananas",
    ],
    "regions": [
        "USA",
        "South Africa",
        "Australia",
        "New Zealand",
        "Canada",
        "UK",
    ],
    "ca_storage": True,
}

def load_existing_companies():
    """Load companies already in system to avoid duplicates.

    Reads the master CSV lists under WORKSPACE and collects the value of each
    row's ``Domain`` column (falling back to ``Website``). Master files that
    do not exist are skipped. A file that exists but cannot be read or decoded
    raises, on purpose: continuing with a partial set would let already-known
    companies be reported as new.

    Returns:
        set: Normalized domains (lower-cased, scheme and ``www.`` removed,
        leading/trailing slashes stripped).
    """
    master_files = [
        WORKSPACE / "verified-scored-facilities.csv",
        WORKSPACE / "SA-MASTER-LIST-400.csv",
        WORKSPACE / "MASTER-LIST-FINAL.csv"
    ]

    existing = set()

    for file in master_files:
        if not file.exists():
            continue

        # 'utf-8-sig' transparently drops a leading byte-order mark (written by
        # Excel's "CSV UTF-8" export); with plain 'utf-8' the BOM would be glued
        # onto the first header name and that column could not be found.
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(file, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                domain = row.get('Domain') or row.get('Website')
                if not domain:
                    continue
                # Keep this normalization identical to the one used in
                # discover_from_existing_lists(); the two are compared directly.
                domain = domain.lower().replace('http://', '').replace('https://', '').replace('www.', '').strip('/')
                existing.add(domain)

    print(f"✓ Loaded {len(existing)} existing companies")
    return existing

def discover_from_apollo(region, fruit_type, limit=50):
    """
    Placeholder for Apollo-based company discovery.

    Looks for an API key file at WORKSPACE/.apollo-key and reads it, but no
    API request is made yet, so the result is always an empty list.
    ``limit`` is accepted for the eventual API call and is currently unused.
    """
    key_path = WORKSPACE / ".apollo-key"

    if key_path.exists():
        # The key is read now so the real request can be dropped in here once
        # API access has been verified.
        _api_key = key_path.read_text().strip()
        print(f"Apollo discovery for {region} {fruit_type} - API integration pending")
    else:
        print("⚠ Apollo API key not found, skipping Apollo discovery")

    return []

def discover_from_existing_lists(region, fruit_type, limit=10):
    """
    Pull from existing CSVs that haven't been enriched yet.

    Scans every ``*.csv`` directly under WORKSPACE (in sorted order, so the
    result is reproducible) and returns rows whose domain is not already known
    to load_existing_companies().

    Args:
        region: Case-insensitive substring that a row's ``Country`` value must
            contain. Rows without a country are kept and labelled with
            ``region``. A falsy region disables the filter.
        fruit_type: Recorded on every result; it is not used for filtering.
        limit: Maximum number of companies to return.

    Returns:
        list of dicts with the keys ``company_name``, ``domain``, ``country``,
        ``fruit_type``, ``source_file`` and ``discovered_date``; at most
        ``limit`` entries, each domain appearing once.
    """
    if limit <= 0:
        return []

    existing = load_existing_companies()
    region_key = region.lower() if region else ''

    companies = []
    seen = set()  # domains already emitted during this call

    for file in sorted(WORKSPACE.glob("*.csv")):
        # Stop opening further files once enough companies were collected.
        if len(companies) >= limit:
            break

        # pathlib's "*" also matches dot-files; skip hidden CSVs.
        if file.name.startswith('.'):
            continue

        try:
            # 'utf-8-sig' drops a leading BOM (Excel "CSV UTF-8" exports) that
            # would otherwise corrupt the first header name; newline='' is the
            # csv module's recommended way to open files.
            with open(file, 'r', encoding='utf-8-sig', newline='') as f:
                for row in csv.DictReader(f):
                    company_name = row.get('Company Name') or row.get('Name') or row.get('Company')
                    domain = row.get('Domain') or row.get('Website')
                    country = row.get('Country')

                    if not company_name or not domain:
                        continue

                    # Keep this normalization identical to the one used in
                    # load_existing_companies(); the two are compared directly.
                    domain_clean = domain.lower().replace('http://', '').replace('https://', '').replace('www.', '').strip('/')

                    # Nothing usable left (e.g. the cell only held "https://").
                    if not domain_clean:
                        continue

                    # Skip if already processed, or already picked up from an
                    # earlier row/file in this same scan.
                    if domain_clean in existing or domain_clean in seen:
                        continue

                    # Region filter only applies when the row states a country.
                    if region_key and country and region_key not in country.lower():
                        continue

                    seen.add(domain_clean)
                    companies.append({
                        'company_name': company_name,
                        'domain': domain_clean,
                        'country': country or region,
                        'fruit_type': fruit_type,
                        'source_file': file.name,
                        'discovered_date': datetime.now().isoformat()
                    })

                    if len(companies) >= limit:
                        break
        except (OSError, UnicodeError, csv.Error) as e:
            # Best effort: one unreadable or malformed list must not abort the scan.
            print(f"⚠ Error reading {file.name}: {e}")
            continue

    return companies

def run_discovery(region="USA", fruit_type="apples", limit=10, test_mode=True):
    """
    Main discovery function.

    Collects up to ``limit`` companies, first from the local CSV lists and
    then, if still short, from Apollo, and writes them to a timestamped CSV
    in OUTPUT_DIR.

    Args:
        region: Target region
        fruit_type: Target fruit type
        limit: Number of companies to find
        test_mode: Only echoed in the log output at present; it does not
            change the search or cap the limit.

    Returns:
        The path of the written CSV as a string, or None when no new
        companies were found (no file is written in that case).
    """
    print(f"\n🔍 Starting discovery: {region} - {fruit_type}")
    print(f"   Limit: {limit} companies")
    print(f"   Test mode: {test_mode}")

    # The set of already-known companies is not loaded here:
    # discover_from_existing_lists() loads it itself, and loading it twice
    # re-read every master file and duplicated the "Loaded ..." log line.
    companies = []

    # 1. Check existing lists first (fastest)
    print("\n1️⃣ Searching existing lists...")
    from_existing = discover_from_existing_lists(region, fruit_type, limit)
    companies.extend(from_existing)
    print(f"   Found {len(from_existing)} new companies")

    # 2. Apollo (if we need more)
    if len(companies) < limit:
        remaining = limit - len(companies)
        print(f"\n2️⃣ Searching Apollo for {remaining} more...")
        from_apollo = discover_from_apollo(region, fruit_type, remaining)
        companies.extend(from_apollo)
        print(f"   Found {len(from_apollo)} from Apollo")

    if not companies:
        print("\n⚠ No new companies found")
        return None

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # A path separator inside a label would redirect the file into a
    # sub-directory that does not exist; all other names are left untouched.
    region_label = str(region).replace("/", "-").replace("\\", "-")
    fruit_label = str(fruit_type).replace("/", "-").replace("\\", "-")
    output_file = OUTPUT_DIR / f"discovered_{region_label}_{fruit_label}_{timestamp}.csv"

    fieldnames = ['company_name', 'domain', 'country', 'fruit_type', 'source_file', 'discovered_date']
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(companies)

    print("\n✅ Discovery complete!")
    print(f"   Found: {len(companies)} companies")
    print(f"   Saved: {output_file}")
    return str(output_file)

if __name__ == "__main__":
    # Smoke run when executed as a script: a small South African citrus search.
    saved_path = run_discovery(region="South Africa", fruit_type="citrus", limit=10, test_mode=True)

    # run_discovery() returns None when nothing new was found.
    if saved_path:
        print(f"\n📁 Output: {saved_path}")
