#!/usr/bin/env python3
"""
Enhanced Facility Research System
Uses web search and website scraping to gather detailed facility data
"""

import csv
import json
import re
import sys
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, List, Optional

@dataclass
class FacilityResearchPlan:
    """Research plan for a single facility.

    Attributes:
        company: Company/facility name used to build the search queries.
        region: Geographic region of the facility.
        website: Facility website URL (may be an empty string).
        search_queries: Web-search query strings; empty until
            generate_queries() is called.
    """
    company: str
    region: str
    website: str
    # default_factory gives each instance its own list and keeps the
    # annotation honest (the original used `List[str] = None` plus a
    # __post_init__ guard to dodge the mutable-default pitfall).
    search_queries: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers that explicitly pass
        # search_queries=None still end up with an empty list.
        if self.search_queries is None:
            self.search_queries = []

    def generate_queries(self) -> List[str]:
        """Populate and return the search queries for this facility.

        Overwrites any previously stored queries. Each query quotes the
        company name so search engines treat it as an exact phrase.
        """
        self.search_queries = [
            f'"{self.company}" cold storage capacity rooms',
            f'"{self.company}" controlled atmosphere storage',
            f'"{self.company}" apple storage facility',
            f'"{self.company}" produce storage square feet',
            f'"{self.company}" GCCA IARW member',
        ]
        return self.search_queries

class ResearchOrchestrator:
    """Coordinates facility research using external tools.

    Attributes:
        research_queue: Plans waiting to be processed (not yet used by
            any method here — reserved for future workflow state).
        completed: Plans already processed (likewise unused so far).
    """

    def __init__(self):
        self.research_queue = []
        self.completed = []

    def create_research_batch(self, facilities: List[Dict], count: int = 20) -> List[FacilityResearchPlan]:
        """Create research plans for the first `count` facilities.

        Args:
            facilities: CSV rows; 'Company', 'Region', and 'Website'
                keys are read (missing keys default to '').
            count: Maximum number of plans to create.

        Returns:
            Plans with their search queries already generated.
        """
        batch = []
        for facility in facilities[:count]:
            plan = FacilityResearchPlan(
                company=facility.get('Company', ''),
                region=facility.get('Region', ''),
                website=facility.get('Website', ''),
            )
            plan.generate_queries()
            batch.append(plan)
        return batch

    def generate_search_commands(self, batch: List[FacilityResearchPlan],
                                 output_dir: Path) -> List[str]:
        """Generate shell-style command lines for executing web searches.

        Side effect: writes one `search-plan-NNN.json` file per plan into
        `output_dir`.

        Args:
            batch: Plans to emit commands for.
            output_dir: Existing directory for the per-plan JSON files.

        Returns:
            Lines (comments + echo commands) describing the searches;
            the queries themselves are emitted as comments for manual or
            scripted execution.
        """
        commands = []
        for i, plan in enumerate(batch):
            # Persist the plan so an external runner can pick it up.
            plan_file = output_dir / f'search-plan-{i:03d}.json'
            # Explicit UTF-8 so output doesn't depend on the platform's
            # default encoding (company names may contain non-ASCII).
            with open(plan_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'company': plan.company,
                    'region': plan.region,
                    'website': plan.website,
                    'queries': plan.search_queries
                }, f, indent=2)

            commands.append(f"# Research: {plan.company}")
            commands.append(f"echo 'Researching {plan.company}...'")

            # Queries are emitted as comments, to be executed manually
            # or by a wrapper script.
            for j, query in enumerate(plan.search_queries):
                commands.append(f"# Query {j+1}: {query}")

        return commands

    def save_research_batch(self, batch: List[FacilityResearchPlan], output_path: Path):
        """Save the whole batch to a single JSON file for processing.

        Args:
            batch: Plans to serialize.
            output_path: Destination JSON file (overwritten if present).

        Returns:
            The list of plain dicts that was written.
        """
        # Note: the key is 'queries' (not 'search_queries') — downstream
        # consumers depend on this schema, so dataclasses.asdict() is
        # deliberately not used here.
        batch_data = [
            {
                'company': plan.company,
                'region': plan.region,
                'website': plan.website,
                'queries': plan.search_queries
            }
            for plan in batch
        ]

        # Explicit UTF-8 for platform-independent output.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(batch_data, f, indent=2)

        print(f"Saved research batch to {output_path}")
        return batch_data

def main():
    """Load the master CSV, build a research batch, and write plan files.

    The workspace directory defaults to the original hard-coded path but
    can now be overridden with the first command-line argument, making
    the script usable outside the author's machine.
    """
    # Generalized: `python script.py /path/to/workspace` overrides the
    # default location; no argument preserves the original behavior.
    if len(sys.argv) > 1:
        workspace = Path(sys.argv[1])
    else:
        workspace = Path('/Users/max/.openclaw/workspace/postharvest')
    input_file = workspace / 'usa-cold-storage-master.csv'

    # Load facilities from the master CSV.
    print(f"Loading facilities from {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        facilities = list(csv.DictReader(f))

    print(f"Loaded {len(facilities)} facilities\n")

    # Prioritize facilities that have a website to scrape.
    facilities_with_web = [f for f in facilities if f.get('Website', '').strip()]
    facilities_no_web = [f for f in facilities if not f.get('Website', '').strip()]

    print(f"Facilities with websites: {len(facilities_with_web)}")
    print(f"Facilities without websites: {len(facilities_no_web)}\n")

    orchestrator = ResearchOrchestrator()

    # Create research plans for the top-priority facilities.
    print("Creating research plans for top 30 facilities with websites...")
    batch1 = orchestrator.create_research_batch(facilities_with_web, count=30)

    # Persist the batch for downstream processing.
    batch1_path = workspace / 'research-batch-001.json'
    orchestrator.save_research_batch(batch1, batch1_path)

    print("\nResearch plan saved. Next steps:")
    print(f"1. Process batch file: {batch1_path}")
    print("2. For each facility, use web_search and web_fetch tools")
    print("3. Extract room counts, square footage, and produce types")
    print("4. Score and save results")

    # Also create a simple CSV template for manual data entry.
    # Explicit UTF-8 so the template encoding doesn't vary by platform.
    template_path = workspace / 'verification-template.csv'
    with open(template_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Company', 'Region', 'Website', 'Total Rooms', 'Square Footage',
            'Primary Produce', 'Premium Varieties', 'Organic', 'CA/MA',
            'Source URL', 'Notes'
        ])
        # One blank row per planned facility, pre-filled with identifiers.
        for plan in batch1:
            writer.writerow([
                plan.company, plan.region, plan.website,
                '', '', '', '', '', '', '', ''
            ])

    print(f"\nVerification template created: {template_path}")
    print("This can be used for manual data entry if needed.")
