#!/usr/bin/env python3
import json
import subprocess
import time
import re

# Load the NZ companies sheet export (Google Sheets "values" shape: a list
# of rows; NOTE trailing empty cells may be omitted entirely from a row).
with open('/Users/max/.openclaw/workspace/postharvest/nz-companies.json', 'r') as f:
    data = json.load(f)
    companies = data['values']

header = companies[0]  # first row holds the column headers
missing = []  # data rows whose website cell (column 1) is blank or absent

for row in companies[1:]:
    # A row is missing a website when column 1 is blank OR the row is too
    # short to have a website cell at all (the export trims trailing empties).
    # Previously name-only rows (len < 2) were silently skipped even though
    # they clearly lack a website. Fully empty rows carry no name to search.
    if row and (len(row) < 2 or not (row[1] or '').strip()):
        missing.append(row)

print(f"Found {len(missing)} companies missing websites\n")

results = []

# Map row identity -> sheet index once. companies.index(row) is O(n) per call
# and returns the FIRST equal row, which yields a wrong row_index whenever two
# rows have identical contents; identity lookup is exact and O(1).
row_index_by_id = {id(r): i for i, r in enumerate(companies)}

# Compile the URL pattern once instead of per output line.
url_pattern = re.compile(r'https?://[^\s\)]+')

for row in missing[:5]:  # Start with first 5 to avoid rate limits
    company_name = row[0] if row else ''
    location = row[3] if len(row) > 3 else ''

    print(f"Searching: {company_name} ({location})")

    # Search for the company via the openclaw CLI (list argv, shell=False).
    query = f"{company_name} New Zealand {location} cold storage"

    result = subprocess.run([
        'openclaw', 'web-search', query, '--count', '3'
    ], capture_output=True, text=True)

    if result.returncode == 0:
        # Take the first URL-looking token from the search output,
        # trimming trailing punctuation that often clings to links.
        potential_url = None
        for line in result.stdout.strip().split('\n'):
            url_match = url_pattern.search(line)
            if url_match:
                potential_url = url_match.group(0).rstrip('.,;')
                break

        if potential_url:
            print(f"  ✅ Found: {potential_url}")
        else:
            print(f"  ❌ No website found")

        # Single append for both outcomes (the dict differed only in 'website').
        results.append({
            'company': company_name,
            'location': location,
            'website': potential_url or '',
            'row_index': row_index_by_id.get(id(row), -1),
        })
    else:
        # Previously a failed search was silently dropped with no trace;
        # surface the failure so missing entries in the output are explainable.
        print(f"  ⚠️ Search failed (exit {result.returncode}): {result.stderr.strip()}")

    time.sleep(1)  # Rate limiting

# Persist the findings so a later step can merge them back into the sheet.
output_path = '/Users/max/.openclaw/workspace/postharvest/nz-website-findings.json'
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Saved findings to nz-website-findings.json")

# Count how many entries actually carry a non-empty website URL.
found_count = sum(1 for entry in results if entry['website'])
print(f"Found websites for {found_count} companies")
