#!/usr/bin/env python3
"""
Systematic facility verification script
Processes facilities in batches to gather:
- Exact room count
- Square footage
- Specific produce varieties
- Organic certification
- CA/MA capabilities
"""

import csv
import json
from datetime import datetime

def load_facilities_csv(filepath='verified-scored-facilities.csv'):
    """Load the facilities CSV and return those still needing verification.

    A facility needs verification when its 'Confidence Level' column reads
    'Confirmed' (as opposed to 'Verified'). Each returned dict carries the
    1-based spreadsheet row number plus a normalized subset of the columns.
    """
    # Output key -> CSV column header.
    field_map = (
        ('company', 'Company'),
        ('region', 'Region'),
        ('website', 'Website'),
        ('size_class', 'Size Classification'),
        ('total_rooms', 'Total Rooms'),
        ('sq_ft', 'Square Footage'),
        ('primary_produce', 'Primary Produce'),
        ('organic', 'Organic'),
        ('ca_ma', 'CA/MA'),
        ('score', 'Score'),
        ('source', 'Verification Source'),
        ('notes', 'Notes'),
    )
    needing_verification = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        # Row numbering starts at 2 because row 1 is the header line.
        for line_no, record in enumerate(csv.DictReader(handle), start=2):
            if record['Confidence Level'] != 'Confirmed':
                continue
            entry = {'row': line_no}
            for key, column in field_map:
                entry[key] = record[column]
            needing_verification.append(entry)
    return needing_verification

def save_research_results(facility_updates, output_file='facility_updates.json'):
    """Save research findings to a JSON file and report how many were saved.

    Args:
        facility_updates: JSON-serializable container of facility update
            records (len() is used for the progress message).
        output_file: Destination path for the JSON dump.
    """
    # Explicit UTF-8 + ensure_ascii=False: facility/company names may contain
    # non-ASCII characters; the default locale encoding on some platforms
    # would otherwise raise or escape them unreadably.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(facility_updates, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(facility_updates)} facility updates to {output_file}")

def generate_search_queries(facility):
    """Build a list of web search queries for researching one facility.

    Always includes a core facility query and an industry-directory query;
    conditionally adds a site-restricted query (when a real website is on
    file) and a produce-specific query (when primary produce is known).
    """
    name = facility['company']
    area = facility['region']
    site = facility['website']
    produce = facility['primary_produce']

    # Core facility search + industry directory search are unconditional.
    search_terms = [
        f'"{name}" {area} cold storage square footage rooms capacity',
        f'"{name}" site:refrigeratedfrozenfood.com OR site:gcca.org',
    ]

    # Company website search — skip placeholder 'N/A' entries.
    if site and site != 'N/A':
        search_terms.append(f'site:{site} facility specifications')

    # Produce-specific search.
    if produce:
        search_terms.append(f'"{name}" {produce} packing storage')

    return search_terms

def main():
    """Entry point: load 'Confirmed' facilities, summarize them by region,
    and write an initial analysis snapshot to verification_analysis.json."""
    print("Loading facilities CSV...")
    pending = load_facilities_csv()

    print(f"\nFound {len(pending)} facilities needing verification")
    print(f"Facilities marked 'Confirmed' that need deeper research\n")

    # Bucket facilities per region so research can proceed region by region.
    grouped = {}
    for facility in pending:
        grouped.setdefault(facility['region'], []).append(facility)

    # Largest regions first.
    print("Facilities by region:")
    ordered = sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
    for region_name, members in ordered:
        print(f"  {region_name}: {len(members)} facilities")

    # Snapshot of the workload; only the first 50 facilities go in batch one.
    snapshot = {
        'timestamp': datetime.now().isoformat(),
        'total_to_verify': len(pending),
        'by_region': {region: len(members) for region, members in grouped.items()},
        'facilities': pending[:50],  # First batch
    }

    with open('verification_analysis.json', 'w') as outfile:
        json.dump(snapshot, outfile, indent=2)

    print(f"\nSaved analysis to verification_analysis.json")
    print(f"Ready to begin systematic verification")

# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
