#!/usr/bin/env python3
"""
Facility Verification & Scoring System
Researches cold storage facilities and scores them based on size and produce types
"""

import csv
import json
import re
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import requests
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass, asdict
from datetime import datetime

@dataclass
class FacilityData:
    """Record describing one facility and the outcome of its verification.

    Every field except ``score`` is stored as free-form text. The defaults
    below are the placeholder values a record carries until something
    overwrites them.
    """

    # Identification fields (required, no defaults).
    company: str
    region: str
    website: str = ""
    # Size bucket label; "Unknown" until a classification is assigned.
    size_classification: str = "Unknown"
    # Room count and floor area, kept as text (not parsed numbers).
    total_rooms: str = ""
    square_footage: str = ""
    # Produce handled and any premium varieties, as free text.
    primary_produce: str = ""
    premium_varieties: str = ""
    # Textual flags; "Unknown" is the default for both.
    organic: str = "Unknown"
    ca_ma_storage: str = "Unknown"
    # Numeric score; 0 until one is assigned.
    score: int = 0
    # Provenance of the data and how much it is trusted.
    verification_source: str = ""
    confidence_level: str = "Unknown"
    notes: str = ""

    def to_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name."""
        return asdict(self)

class FacilityVerifier:
    """Verify, classify and score cold storage facilities.

    The verifier optionally scrapes a facility's website for room counts,
    square footage, produce keywords, organic certification and CA/MA
    storage mentions, then assigns a size classification and a numeric score.
    """

    # Base score awarded for each size classification.
    SIZE_SCORES = {
        'XXLarge': 100,  # 50+ rooms OR 500K+ sqft
        'XLarge': 90,    # 30-49 rooms OR 300-500K sqft
        'Large': 80,     # 20-29 rooms OR 150-300K sqft
        'Medium': 70,    # 10-19 rooms OR 50-150K sqft
        'Small': 60      # under 10 rooms
    }

    # Premium apple varieties (matched as lowercase substrings).
    PREMIUM_APPLES = (
        'jazz', 'envy', 'cosmic crisp', 'pacific rose', 'rockit',
        'honeycrisp', 'pink lady', 'kanzi', 'sweetango', 'snap dragon',
    )

    # High-value produce (lowercase stems, so 'blueberr' matches both
    # 'blueberry' and 'blueberries').
    HIGH_VALUE_PRODUCE = (
        'avocado', 'blueberr', 'strawberr', 'raspberr', 'blackberr',
        'kiwi', 'cherry', 'citrus', 'pear',
    )

    # (minimum value, label) pairs checked from largest to smallest;
    # anything below the last minimum is 'Small'.
    _ROOM_TIERS = (
        (50, 'XXLarge'), (30, 'XLarge'), (20, 'Large'), (10, 'Medium'),
    )
    _SQFT_TIERS = (
        (500000, 'XXLarge'), (300000, 'XLarge'),
        (150000, 'Large'), (50000, 'Medium'),
    )

    # Regexes tried in priority order against lowercased page text.
    _ROOM_PATTERNS = (
        r'(\d+)\s*(?:controlled atmosphere|ca|ma)?\s*(?:rooms?|chambers?)',
        r'(\d+)\s*storage\s*rooms?',
        r'(\d+)\+?\s*rooms?',
    )
    # The capture must start with a digit so that a stray comma before
    # "sf"/"sq ft" cannot produce an empty square-footage value.
    _SQFT_PATTERNS = (
        r'(\d[\d,]*)\s*(?:square\s*feet|sq\.?\s*ft\.?|sf)',
        r'(\d[\d,]*)\s*ft²',
    )

    # Produce keywords looked for in page text ('berr' is a stem).
    _PRODUCE_KEYWORDS = (
        'apple', 'citrus', 'orange', 'lemon', 'grapefruit', 'mandarin',
        'pear', 'cherry', 'berr', 'kiwi', 'avocado', 'peach', 'plum',
        'nectarine', 'apricot',
    )

    # Output CSV column -> FacilityData attribute, in column order.
    _CSV_COLUMNS = (
        ('Company', 'company'),
        ('Region', 'region'),
        ('Website', 'website'),
        ('Size Classification', 'size_classification'),
        ('Total Rooms', 'total_rooms'),
        ('Square Footage', 'square_footage'),
        ('Primary Produce', 'primary_produce'),
        ('Premium Varieties', 'premium_varieties'),
        ('Organic', 'organic'),
        ('CA/MA', 'ca_ma_storage'),
        ('Score', 'score'),
        ('Verification Source', 'verification_source'),
        ('Confidence Level', 'confidence_level'),
        ('Notes', 'notes'),
    )

    def __init__(self):
        """Create the HTTP session and per-instance scoring tables."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })
        self.results = []

        # Per-instance copies so callers can tune them without touching
        # the class-level defaults.
        self.size_scores = dict(self.SIZE_SCORES)
        self.premium_apples = list(self.PREMIUM_APPLES)
        self.high_value_produce = list(self.HIGH_VALUE_PRODUCE)

    @staticmethod
    def _first_int(text) -> Optional[int]:
        """Return the first run of digits in *text* as an int, or None.

        Thousands separators (commas) are ignored. Non-string input is
        converted with ``str`` first.
        """
        if not text:
            return None
        match = re.search(r'(\d+)', str(text).replace(',', ''))
        return int(match.group(1)) if match else None

    @staticmethod
    def _tier_for(value: int, tiers) -> str:
        """Return the label of the first tier whose minimum *value* reaches."""
        for minimum, label in tiers:
            if value >= minimum:
                return label
        return 'Small'

    def classify_size(self, rooms: str, sqft: str) -> str:
        """Classify facility size based on rooms or square footage.

        Args:
            rooms: Room count as text (e.g. ``'24+'``); may be empty or None.
            sqft: Square footage as text (e.g. ``'180,000'``); may be empty
                or None.

        Returns:
            One of 'XXLarge', 'XLarge', 'Large', 'Medium', 'Small', or
            'Unknown' when neither value yields a positive number. A usable
            room count takes precedence over square footage.
        """
        room_num = self._first_int(rooms)
        if room_num:
            return self._tier_for(room_num, self._ROOM_TIERS)

        sqft_num = self._first_int(sqft)
        if sqft_num:
            return self._tier_for(sqft_num, self._SQFT_TIERS)

        return 'Unknown'

    def calculate_score(self, facility: "FacilityData") -> int:
        """Calculate facility score based on algorithm.

        The score is the base value for the facility's size classification
        (0 when unrecognised) plus bonuses: +10 premium apple variety,
        +10 high-value produce, +5 for three or more distinct produce types,
        +5 organic, +5 CA/MA storage.

        Args:
            facility: Record whose text fields are inspected; fields that
                are None are treated as empty.

        Returns:
            The total score.
        """
        produce_lower = (facility.primary_produce or '').lower()
        varieties_lower = (facility.premium_varieties or '').lower()
        organic_lower = (facility.organic or '').lower()
        ca_ma_lower = (facility.ca_ma_storage or '').lower()

        # Base score by size
        score = self.size_scores.get(facility.size_classification, 0)

        # Premium apple varieties (+10)
        if any(variety in varieties_lower or variety in produce_lower
               for variety in self.premium_apples):
            score += 10

        # High-value produce (+10)
        if any(produce in produce_lower for produce in self.high_value_produce):
            score += 10

        # Multi-fruit operations 3+ types (+5): count distinct, non-empty
        # comma-separated entries rather than words or raw commas.
        produce_types = {item.strip() for item in produce_lower.split(',')}
        produce_types.discard('')
        if len(produce_types) >= 3:
            score += 5

        # Organic certified (+5)
        if 'yes' in organic_lower or 'certified' in organic_lower:
            score += 5

        # CA/MA storage (+5)
        if 'yes' in ca_ma_lower:
            score += 5

        return score

    def search_web_for_facility(self, company: str, region: str) -> Dict:
        """Return placeholder web-search data for a facility.

        No search is performed yet: the arguments are ignored and a fixed
        dict of empty/'Unknown' values is returned, in the same shape that
        ``scrape_website`` produces.
        """
        return {
            'rooms': '',
            'sqft': '',
            'produce': '',
            'premium_varieties': '',
            'organic': 'Unknown',
            'ca_ma': 'Unknown',
            'source': 'Web Search',
            'confidence': 'Estimated'
        }

    def _extract_page_details(self, content: str) -> Dict:
        """Pull facility details out of lowercased page text."""
        # Look for room/chamber counts
        rooms = ''
        for pattern in self._ROOM_PATTERNS:
            match = re.search(pattern, content)
            if match:
                rooms = match.group(1) + '+'
                break

        # Look for square footage
        sqft = ''
        for pattern in self._SQFT_PATTERNS:
            match = re.search(pattern, content)
            if match:
                sqft = match.group(1).replace(',', '')
                break

        # Keyword order is preserved so the output is deterministic.
        produce_types = [kw for kw in self._PRODUCE_KEYWORDS if kw in content]
        premium_found = [v for v in self.premium_apples if v in content]

        organic = 'Unknown'
        if 'organic' in content and ('certified' in content or 'usda organic' in content):
            organic = 'Yes'

        ca_ma = 'Unknown'
        if 'controlled atmosphere' in content or 'ca storage' in content or 'ma storage' in content:
            ca_ma = 'Yes'

        return {
            'rooms': rooms,
            'sqft': sqft,
            'produce': ', '.join(produce_types[:5]),
            'premium_varieties': ', '.join(premium_found),
            'organic': organic,
            'ca_ma': ca_ma,
            'source': 'Website Scrape',
            'confidence': 'Verified'
        }

    def scrape_website(self, url: str) -> Dict:
        """Scrape facility website for information.

        Args:
            url: Page to fetch. A value without a scheme (for example
                ``'www.example.com'``) is fetched over ``https://``.

        Returns:
            A dict with keys 'rooms', 'sqft', 'produce', 'premium_varieties',
            'organic', 'ca_ma', 'source' and 'confidence', or an empty dict
            when the URL is blank, the response status is not 200, or the
            request fails.
        """
        url = (url or '').strip()
        if not url:
            return {}

        # Spreadsheet-sourced URLs frequently omit the scheme, which the
        # HTTP client rejects outright.
        if '://' not in url:
            url = 'https://' + url.lstrip('/')

        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                return {}
            return self._extract_page_details(response.text.lower())
        except Exception as e:
            # Best effort: one unreachable site must not abort the batch.
            print(f"Error scraping {url}: {e}")
            return {}

    def verify_facility(self, company: str, website: str, region: str,
                        existing_rooms: str, existing_produce: str,
                        existing_ca: str) -> "FacilityData":
        """Verify a single facility.

        Builds a record from the existing data, overlays anything found on
        the facility's website, then classifies its size and scores it.
        Arguments that are None (as produced by short CSV rows) are treated
        as empty strings.

        Returns:
            The populated and scored FacilityData record.
        """
        company = company or ''
        website = website or ''
        region = region or ''
        existing_rooms = existing_rooms or ''
        existing_produce = existing_produce or ''
        has_ca = (existing_ca or '').strip().lower() == 'yes'

        facility = FacilityData(
            company=company,
            region=region,
            website=website,
            total_rooms=existing_rooms,
            primary_produce=existing_produce,
            ca_ma_storage='Yes' if has_ca else 'Unknown'
        )

        # Try website scraping first
        if website:
            print(f"  Scraping {website}...")
            web_data = self.scrape_website(website)

            if web_data:
                # An existing room count is kept; other scraped values
                # overwrite whatever was supplied.
                if web_data.get('rooms') and not existing_rooms:
                    facility.total_rooms = web_data['rooms']
                if web_data.get('sqft'):
                    facility.square_footage = web_data['sqft']
                if web_data.get('produce'):
                    facility.primary_produce = web_data['produce']
                if web_data.get('premium_varieties'):
                    facility.premium_varieties = web_data['premium_varieties']
                if web_data.get('organic', 'Unknown') != 'Unknown':
                    facility.organic = web_data['organic']
                if web_data.get('ca_ma', 'Unknown') != 'Unknown':
                    facility.ca_ma_storage = web_data['ca_ma']

                facility.verification_source = web_data.get('source', 'Website')
                facility.confidence_level = web_data.get('confidence', 'Verified')

        # Classify size
        facility.size_classification = self.classify_size(
            facility.total_rooms,
            facility.square_footage
        )

        # Calculate score
        facility.score = self.calculate_score(facility)

        return facility

    def process_batch(self, facilities: List[Dict], start_idx: int = 0,
                      batch_size: int = 20,
                      delay: float = 1.0) -> List["FacilityData"]:
        """Process a batch of facilities.

        Args:
            facilities: Input rows with the keys 'Company', 'Website',
                'Region', 'Rooms', 'Primary Fruit' and 'CA Storage'.
            start_idx: Index of the first row to process.
            batch_size: Maximum number of rows to process.
            delay: Seconds to pause after each facility (rate limiting);
                zero or a negative value disables the pause.

        Returns:
            The verified records for rows ``start_idx`` up to (but not
            including) ``start_idx + batch_size``, clipped to the input.
        """
        results = []
        end_idx = min(start_idx + batch_size, len(facilities))

        print(f"\nProcessing batch {start_idx}-{end_idx} of {len(facilities)} facilities...")

        for i in range(start_idx, end_idx):
            facility_input = facilities[i]
            company = facility_input.get('Company', '')

            print(f"\n[{i+1}/{len(facilities)}] {company}")

            verified = self.verify_facility(
                company=company,
                website=facility_input.get('Website', ''),
                region=facility_input.get('Region', ''),
                existing_rooms=facility_input.get('Rooms', ''),
                existing_produce=facility_input.get('Primary Fruit', ''),
                existing_ca=facility_input.get('CA Storage', '')
            )

            results.append(verified)

            # Rate limiting
            if delay > 0:
                time.sleep(delay)

        return results

    def save_results(self, results: List["FacilityData"], output_path: str):
        """Save results to CSV.

        Args:
            results: Records to write, one row each, in the given order.
            output_path: Destination file; overwritten if it exists.
        """
        fieldnames = [column for column, _ in self._CSV_COLUMNS]

        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            for facility in results:
                writer.writerow({
                    column: getattr(facility, attr)
                    for column, attr in self._CSV_COLUMNS
                })

        print(f"\nSaved {len(results)} verified facilities to {output_path}")

def main():
    """Main execution.

    Loads the master facility list, verifies and scores the first 200
    facilities in batches (saving progress after every batch), writes the
    final results sorted by score, and prints a summary.
    """

    workspace = Path('/Users/max/.openclaw/workspace/postharvest')
    input_file = workspace / 'usa-cold-storage-master.csv'
    output_file = workspace / 'verified-scored-facilities.csv'

    # Load facilities
    print(f"Loading facilities from {input_file}...")
    # 'utf-8-sig' drops a leading byte-order mark (common in spreadsheet
    # exports) that would otherwise become part of the first header name;
    # newline='' lets the csv module handle embedded line breaks itself.
    with open(input_file, 'r', newline='', encoding='utf-8-sig') as f:
        facilities = list(csv.DictReader(f))

    print(f"Loaded {len(facilities)} facilities")

    # Initialize verifier
    verifier = FacilityVerifier()

    # Process in batches
    all_results = []
    batch_size = 30

    # Start with first 200 (prioritized)
    target_count = min(200, len(facilities))

    for start_idx in range(0, target_count, batch_size):
        batch_results = verifier.process_batch(facilities, start_idx, batch_size)
        all_results.extend(batch_results)

        # Save progress after each batch so an interrupted run keeps its work
        verifier.save_results(all_results, output_file)

        print(f"\nCompleted {len(all_results)}/{target_count} facilities")

        # Sleep between batches
        if start_idx + batch_size < target_count:
            print("Sleeping 5 seconds before next batch...")
            time.sleep(5)

    # Sort by score (highest first)
    all_results.sort(key=lambda x: x.score, reverse=True)

    # Final save
    verifier.save_results(all_results, output_file)

    # Summary statistics
    print("\n" + "="*60)
    print("VERIFICATION COMPLETE")
    print("="*60)
    print(f"Total facilities verified: {len(all_results)}")
    print("\nSize Distribution:")
    for size in ['XXLarge', 'XLarge', 'Large', 'Medium', 'Small', 'Unknown']:
        count = sum(1 for f in all_results if f.size_classification == size)
        print(f"  {size}: {count}")

    print("\nTop 10 Highest Scoring Facilities:")
    for i, facility in enumerate(all_results[:10], 1):
        print(f"  {i}. {facility.company} ({facility.region})")
        print(f"      Score: {facility.score} | Size: {facility.size_classification} | " +
              f"Rooms: {facility.total_rooms or 'Unknown'}")

    print(f"\nResults saved to: {output_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()