#!/usr/bin/env python3
"""
Contact Enrichment Script using Hunter.io API
Enriches facility database with decision-maker contacts
"""

import csv
import json
import os
import time
from urllib.parse import urlparse

import requests

# Hunter.io API key.
# Taken from the HUNTER_API_KEY environment variable when it is set and
# non-empty, so the credential does not have to live in source control.
# The embedded value is kept only as a backward-compatible fallback.
# NOTE(review): a key committed to a repository should be treated as exposed;
# rotate it and drop the fallback once deployments export HUNTER_API_KEY.
HUNTER_API_KEY = (
    os.environ.get("HUNTER_API_KEY")
    or "fda8536970076bc3228c5b5fa6e19fdc407c43c9"
)

# Job titles treated as decision-makers at cold storage facilities.
# Matching against these is done case-insensitively by substring.
TARGET_TITLES = [
    # Operations and quality
    "Operations Manager",
    "QA Manager",
    "Quality Manager",
    # Executive
    "General Manager",
    "CEO",
    "President",
    # Storage and handling
    "Post-Harvest Manager",
    "Cold Storage Manager",
    "Warehouse Manager",
]

def extract_domain(url):
    """Extract the bare host name from a website URL.

    Args:
        url: Website value as found in the input data. May be a full URL,
            a bare host ("example.com"), empty, None, or the placeholder
            'N/A'.

    Returns:
        The lower-cased host name without a leading "www.", port or
        credentials, or None when no host can be determined.
    """
    if not url:
        return None

    url = url.strip()
    if not url or url.upper() == 'N/A':
        return None

    # Only prepend a scheme when none is present. Testing for the "http"
    # prefix alone misclassifies hosts such as "httpbin.org".
    if url.startswith('//'):
        url = 'https:' + url
    elif '://' not in url:
        url = 'https://' + url

    try:
        # .hostname lower-cases the host and drops any port / userinfo.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects malformed input such as an unbalanced IPv6 literal.
        return None

    if not host:
        return None

    # Strip "www." only as a prefix; str.replace would also remove it from
    # the middle of a host name.
    if host.startswith('www.'):
        host = host[len('www.'):]

    return host or None

def search_contacts(domain, limit=10):
    """Search for contacts at a domain using the Hunter.io domain-search API.

    Args:
        domain: Domain name to look up.
        limit: Maximum number of email records to request.

    Returns:
        The decoded JSON response body, or None when the request fails,
        the server answers with an HTTP error status, or the body is not
        valid JSON. Failures are reported on stdout.
    """
    url = "https://api.hunter.io/v2/domain-search"
    params = {
        'domain': domain,
        'limit': limit,
    }
    # Send the key in a header rather than the query string: requests embeds
    # the full request URL in HTTPError messages, so a query-string key would
    # be echoed by the error print below.
    headers = {'X-API-KEY': HUNTER_API_KEY}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"Error searching {domain}: {e}")
        return None

def filter_relevant_contacts(emails):
    """Filter contacts down to decision-makers.

    A contact is kept when its position contains (case-insensitively) one of
    TARGET_TITLES or one of the generic seniority keywords below.

    Args:
        emails: Iterable of email record dicts from the API response. None
            is treated as an empty list.

    Returns:
        A list of dicts with the keys 'email', 'first_name', 'last_name',
        'position', 'confidence', 'linkedin', 'department' and 'seniority'.
        Contacts without a position are never included.
    """
    # Lower-case the targets once instead of once per contact.
    target_titles = [title.lower() for title in TARGET_TITLES]
    # Also include C-level and VP roles
    seniority_keywords = ('ceo', 'president', 'vp', 'vice president', 'director', 'manager')

    relevant = []

    for email in emails or []:
        # The key can be present with a null value, in which case
        # .get('position', '') returns None rather than the default.
        position = (email.get('position') or '').lower()

        is_relevant = (
            any(title in position for title in target_titles)
            or any(keyword in position for keyword in seniority_keywords)
        )
        if not is_relevant:
            continue

        relevant.append({
            'email': email.get('value'),
            'first_name': email.get('first_name'),
            'last_name': email.get('last_name'),
            'position': email.get('position'),
            'confidence': email.get('confidence'),
            'linkedin': email.get('linkedin'),
            'department': email.get('department'),
            'seniority': email.get('seniority'),
        })

    return relevant

def enrich_facility(company_name, website, limit=20):
    """Enrich a single facility with contacts.

    Args:
        company_name: Facility name, used for progress output and stored in
            the result.
        website: Facility website; passed to extract_domain().
        limit: Maximum number of email records to request for the domain.

    Returns:
        A dict with the keys 'company', 'domain', 'email_pattern',
        'contacts' and 'total_emails_found', or None when the website yields
        no domain or the search returns no data.
    """
    domain = extract_domain(website)

    if not domain:
        print(f"⚠️  {company_name}: No valid website")
        return None

    print(f"🔍 Searching {company_name} ({domain})...")

    result = search_contacts(domain, limit=limit)

    # Treat a missing or null 'data' member the same way as a failed search.
    data = result.get('data') if isinstance(result, dict) else None
    if data is None:
        print("   ❌ No data returned")
        return None

    # Use `or` so that explicit nulls fall back to the defaults as well;
    # dict.get's default only applies when the key is absent.
    all_emails = data.get('emails') or []
    pattern = data.get('pattern') or 'Unknown'

    # Filter to relevant contacts
    contacts = filter_relevant_contacts(all_emails)

    print(f"   ✅ Found {len(contacts)} relevant contacts (from {len(all_emails)} total)")

    return {
        'company': company_name,
        'domain': domain,
        'email_pattern': pattern,
        'contacts': contacts,
        'total_emails_found': len(all_emails),
    }

def _score_of(facility):
    """Return the facility's 'Score' as a number; 0 if missing or non-numeric."""
    try:
        score = float(facility.get('Score', 0))
    except (ValueError, TypeError):
        return 0
    # NaN compares unequal to itself and would make the sort order undefined.
    return score if score == score else 0


def _load_facilities(path):
    """Read every row of the facilities CSV at `path` into a list of dicts."""
    # newline='' is what the csv module expects for files it parses;
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM so the first
    # header name is not corrupted.
    with open(path, 'r', newline='', encoding='utf-8-sig') as f:
        return list(csv.DictReader(f))


def _write_contacts_csv(path, enriched_data):
    """Write one CSV row per contact across all enriched facilities."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Company', 'Domain', 'Email Pattern', 'First Name', 'Last Name', 'Position', 'Email', 'Confidence', 'LinkedIn', 'Department', 'Seniority'])

        for facility in enriched_data:
            for contact in facility['contacts']:
                writer.writerow([
                    facility['company'],
                    facility['domain'],
                    facility['email_pattern'],
                    contact.get('first_name', ''),
                    contact.get('last_name', ''),
                    contact.get('position', ''),
                    contact.get('email', ''),
                    contact.get('confidence', ''),
                    contact.get('linkedin', ''),
                    contact.get('department', ''),
                    contact.get('seniority', ''),
                ])


def main():
    """Main enrichment workflow.

    Reads 'verified-scored-facilities.csv', takes the 20 highest-scoring
    rows, looks up contacts for each via enrich_facility(), and writes the
    results to 'enriched-top20-contacts.json' and
    'enriched-top20-contacts.csv'. Progress and a short summary are printed
    to stdout.
    """
    top_n = 20

    # Read facilities from CSV and keep the top_n by score
    facilities = _load_facilities('verified-scored-facilities.csv')
    top_facilities = sorted(facilities, key=_score_of, reverse=True)[:top_n]
    total = len(top_facilities)

    print(f"🚀 Enriching top {total} USA facilities with Hunter.io API\n")

    enriched_data = []

    for i, facility in enumerate(top_facilities, 1):
        # .get() so a missing column or short row does not abort the run.
        company = facility.get('Company') or ''
        website = facility.get('Website')
        score = facility.get('Score')

        print(f"\n[{i}/{total}] Score: {score} | {company}")

        result = enrich_facility(company, website, limit=20)

        if result:
            enriched_data.append(result)

        # Rate limiting: pause between requests (per the original note,
        # Hunter.io allows 50 requests/minute on the free tier - TODO confirm).
        # No pause is needed after the final request.
        if i < total:
            time.sleep(1.5)

    # Save enriched data to JSON
    output_file = 'enriched-top20-contacts.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, indent=2)

    print("\n\n✅ Enrichment complete!")
    print(f"📊 Results: {len(enriched_data)} facilities enriched")
    print(f"💾 Saved to: {output_file}")

    # Generate summary
    total_contacts = sum(len(f['contacts']) for f in enriched_data)
    # Guard the division: every lookup may have failed.
    average = total_contacts / len(enriched_data) if enriched_data else 0.0
    print(f"🎯 Total decision-maker contacts found: {total_contacts}")
    print(f"📧 Average contacts per facility: {average:.1f}")

    # Save to CSV for easy review
    csv_output = 'enriched-top20-contacts.csv'
    _write_contacts_csv(csv_output, enriched_data)

    print(f"📁 CSV saved to: {csv_output}")

    # Show top 3 examples
    print("\n📋 Sample Contacts Found:")
    for facility in enriched_data[:3]:
        print(f"\n  {facility['company']} ({facility['domain']})")
        print(f"  Pattern: {facility['email_pattern']}")
        for contact in facility['contacts'][:3]:
            print(f"    • {contact.get('first_name')} {contact.get('last_name')} - {contact.get('position')} ({contact.get('confidence')}%)")

# Run the enrichment workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
