#!/usr/bin/env python3
"""
Merge all regional cold storage CSVs into one master file
"""
import csv
import os

# All regional files
# Regional segments to merge; every source file follows the
# "<segment>-cold-storage.csv" naming pattern.
_segments = [
    'pacificnw',
    'florida',
    'california',
    'greatlakes',
    'national-operators',
    'southwest',
    'southeast',
    'mountain',
    'midwest',
    'midatlantic',
    'plains',
    'newengland',
    'remaining-states',
]
files = [f'{segment}-cold-storage.csv' for segment in _segments]

# Workspace directory that holds every regional CSV.
base_path = '/Users/max/.openclaw/workspace/postharvest/'

# Canonical output schema shared by the merged master file.
header = [
    'Company', 'Website', 'Country', 'Region', 'Primary Fruit',
    'Rooms', 'Address', 'Phone', 'Source',
]

all_rows = []  # merged, deduplicated records
seen = set()   # lowercase "company|region" keys already emitted

for filename in files:
    filepath = os.path.join(base_path, filename)
    if not os.path.exists(filepath):
        # BUG FIX: message previously printed the literal text "(unknown)"
        # instead of interpolating the filename.
        print(f"⚠️  Skipping missing file: {filename}")
        continue

    # BUG FIX: same missing interpolation as above.
    print(f"📖 Reading {filename}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        count = 0
        for row in reader:
            # Dedup key: company + region, case-insensitive.
            company = row.get('Company', '').strip()
            region = row.get('Region', '').strip()
            key = f"{company}|{region}".lower()

            # NOTE(review): key always contains "|" and is therefore always
            # truthy, so the `key and` guard never filters anything — rows
            # with an empty Company and Region dedupe against each other on
            # the key "|". Preserved as-is; confirm whether `if company`
            # was intended.
            if key and key not in seen:
                seen.add(key)

                # Normalize to the standard column set; only Country gets a
                # non-empty default ('USA') when the column is absent.
                normalized = {
                    col: row.get(col, 'USA' if col == 'Country' else '')
                    for col in header
                }

                all_rows.append(normalized)
                count += 1
        print(f"   ✅ Added {count} unique facilities")

# Report the merged total, then persist everything to the master CSV.
total = len(all_rows)
print(f"\n📊 Total unique facilities: {total}")

output_file = os.path.join(base_path, 'usa-cold-storage-master.csv')
print(f"\n💾 Writing to {output_file}...")

with open(output_file, 'w', newline='', encoding='utf-8') as out:
    writer = csv.DictWriter(out, fieldnames=header)
    writer.writeheader()
    for record in all_rows:
        writer.writerow(record)

print(f"✅ Saved {len(all_rows)} facilities to usa-cold-storage-master.csv")
print(f"\nTop 20 states by facility count:")

# Tally facilities per state. Region is usually "City, ST", so the text
# after the last comma is taken as the state; comma-less regions count
# under their full text.
state_counts = {}
for record in all_rows:
    region = record.get('Region', 'Unknown')
    state = region.rsplit(',', 1)[-1].strip() if ',' in region else region
    state_counts[state] = state_counts.get(state, 0) + 1

# Print the 20 busiest states, highest count first (stable sort keeps
# ties in first-seen order, matching dict insertion order).
ranked = sorted(state_counts.items(), key=lambda kv: kv[1], reverse=True)
for name, count in ranked[:20]:
    print(f"  {name}: {count}")