#!/usr/bin/env python3
import csv
import json

# Load desktop CSV (source of truth - 2,180 rows)
desktop = []
with open('/Users/max/Desktop/PHT-Master-List.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    desktop = list(reader)

print(f"Desktop CSV: {len(desktop)} rows (including header)")

# Load Google Sheet data (1,993 rows)
with open('/Users/max/.openclaw/workspace/postharvest/google-master-list.json', 'r') as f:
    google_data = json.load(f)
    google_rows = google_data['values']

print(f"Google Sheet: {len(google_rows)} rows (including header)")

# Create sets of company identifiers (name + website) for comparison
def make_key(row):
    # Use company name + website as unique key
    if len(row) >= 2:
        name = (row[0] or '').strip().lower()
        website = (row[1] or '').strip().lower().replace('https://', '').replace('http://', '')
        return f"{name}|{website}"
    return None

desktop_keys = set()
for row in desktop[1:]:  # Skip header
    key = make_key(row)
    if key:
        desktop_keys.add(key)

google_keys = set()
for row in google_rows[1:]:  # Skip header
    key = make_key(row)
    if key:
        google_keys.add(key)

# Find missing companies (in desktop but not in Google)
missing_keys = desktop_keys - google_keys
print(f"\nMissing in Google Sheet: {len(missing_keys)} companies")

# Find the actual rows
missing_rows = []
for row in desktop[1:]:
    key = make_key(row)
    if key in missing_keys:
        missing_rows.append(row)

# Save missing rows
if missing_rows:
    # Add header
    output_data = [desktop[0]] + missing_rows
    
    with open('/Users/max/.openclaw/workspace/postharvest/missing-companies.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(output_data)
    
    # Also save as JSON for gog
    with open('/Users/max/.openclaw/workspace/postharvest/missing-companies.json', 'w') as f:
        json.dump(output_data, f)
    
    print(f"✅ Saved {len(missing_rows)} missing companies to missing-companies.csv/json")
    print(f"\nSample of missing companies:")
    for row in missing_rows[:5]:
        print(f"  - {row[0]} ({row[2]})")
else:
    print("✅ No missing companies - Google Sheet is up to date!")

# Also check: any in Google but NOT in desktop? (shouldn't happen but worth checking)
extra_keys = google_keys - desktop_keys
if extra_keys:
    print(f"\n⚠️  Warning: {len(extra_keys)} companies in Google but NOT on desktop")
