#!/usr/bin/env python3
"""
Parse NZ sheet data file which has space-separated columns
"""

import csv
import re

_COLUMN_SEPARATOR = re.compile(r'  +')


def _split_columns(line):
    """Split *line* on runs of two or more spaces, dropping empty fields."""
    fields = (field.strip() for field in _COLUMN_SEPARATOR.split(line.strip()))
    return [field for field in fields if field]


def parse_nz_file(filepath):
    """Parse the NZ data file whose columns are separated by 2+ spaces.

    The first line of the file (a grouped header) is ignored, the second
    line supplies the column names, and every following non-blank line is
    a data row.  Single spaces are kept as part of a value; only runs of
    two or more spaces separate columns.

    Values are matched to column names purely by position in the split
    result, so:

    * a row with fewer values than columns is padded with ``''``;
    * values beyond the last column name are dropped;
    * a blank cell in the middle of a row cannot be detected (it merges
      into the surrounding separator), so later values shift left.

    The detected column names are printed to stdout as a diagnostic.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A ``(rows, header_parts)`` tuple: ``rows`` is a list of dicts
        mapping column name to string value, ``header_parts`` is the list
        of column names in file order.  A file with fewer than two lines
        yields ``([], [])``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Line 0 is the grouped header; line 1 holds the real column names.
    # A file too short to have a header line simply has no columns.
    header_parts = _split_columns(lines[1]) if len(lines) > 1 else []

    print(f"Detected {len(header_parts)} columns:")
    for index, col in enumerate(header_parts):
        print(f"  {index}: {col}")

    rows = []
    for line in lines[2:]:  # Skip the two header lines
        parts = _split_columns(line)
        if not parts:
            # Blank (or whitespace-only) line.
            continue

        # Pad short rows so every column gets a value; zip() drops any
        # surplus values past the last column name.
        padding = [''] * (len(header_parts) - len(parts))
        rows.append(dict(zip(header_parts, parts + padding)))

    return rows, header_parts

if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the input path;
    # with no argument the original hard-coded location is used.
    data_path = (
        sys.argv[1] if len(sys.argv) > 1
        else '/Users/max/.openclaw/workspace/postharvest/nz-sheet-data.tsv'
    )
    rows, headers = parse_nz_file(data_path)

    print(f"\n\nParsed {len(rows)} rows")
    if rows:
        # Preview only the first five columns of the first data row.
        print("\nFirst row:")
        for key, val in list(rows[0].items())[:5]:
            print(f"  {key}: {val}")

    # Count rows in which any column mentions "kiwi" (case-insensitive);
    # each matching row is reported and counted once.
    kiwi_count = 0
    print("\n\nKiwifruit companies:")
    for row in rows:
        if any('kiwi' in row.get(col_name, '').lower() for col_name in headers):
            print(f"  {row.get('Company Name', 'N/A')}: {row.get('Produce', 'N/A')}")
            kiwi_count += 1

    print(f"\nTotal kiwifruit companies: {kiwi_count}")
