#!/usr/bin/env python3
"""
Parse NZ file using fixed-width column detection from header
"""

import re

def find_column_positions(header_line):
    """Locate every column title in a fixed-width header line.

    Titles are the pieces of the right-stripped header separated by runs of
    two or more spaces; a single space is treated as part of a title.

    Returns a list of ``(title, start_offset)`` tuples in left-to-right
    order, where ``title`` is whitespace-stripped and ``start_offset`` is the
    index in ``header_line`` at which the unstripped piece begins.
    """
    found = []
    cursor = 0  # resume each search after the previous title so repeated text is not re-matched

    for chunk in re.split(r'  +', header_line.rstrip()):
        label = chunk.strip()
        if label:
            offset = header_line.index(chunk, cursor)
            cursor = offset + len(chunk)
            found.append((label, offset))

    return found

def parse_nz_file_fixed_width(filepath):
    """Parse a fixed-width text file whose layout is given by its second line.

    The second line of the file is used as the header: column start offsets
    are detected from it with ``find_column_positions``, and each column runs
    up to the start of the next one (the last column runs to the end of the
    line). Every non-blank line after the header is sliced at those offsets.
    The detected layout is printed to stdout.

    Args:
        filepath: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A ``(rows, headers)`` tuple. ``rows`` is a list of dicts mapping each
        column name to the whitespace-stripped text found in that column;
        ``headers`` is the list of column names in left-to-right order. If
        two columns share a name, the right-most one wins in each row dict.
        Both lists are empty when the file has fewer than two lines, because
        there is then no header row to read the layout from.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Without a second line there is no header; report an empty result
    # instead of failing with an IndexError on lines[1].
    if len(lines) < 2:
        print("Detected 0 columns:")
        return [], []

    # Get column positions from header (line 2)
    col_positions = find_column_positions(lines[1])

    # Each column ends where the next one starts; None marks "to end of line"
    # and is also a valid open-ended slice bound.
    starts = [start for _, start in col_positions]
    ends = starts[1:] + [None]
    columns = [
        (name, start, end)
        for (name, start), end in zip(col_positions, ends)
    ]

    print(f"Detected {len(columns)} columns:")
    for name, start, end in columns:
        print(f"  {name}: {start}-{'EOL' if end is None else end}")

    # Parse data rows, skipping the first 2 header rows and any blank lines
    rows = [
        {name: line[start:end].strip() for name, start, end in columns}
        for line in lines[2:]
        if line.strip()
    ]

    return rows, [name for name, _, _ in columns]

if __name__ == '__main__':
    # Script entry point: parse the sheet export and list the rows that mention kiwifruit.
    rows, headers = parse_nz_file_fixed_width('/Users/max/.openclaw/workspace/postharvest/nz-sheet-data.tsv')

    print(f"\n\nParsed {len(rows)} rows")

    # A row counts as a kiwifruit company when "kiwi" appears (case-insensitively)
    # in either of the two fruit-related columns; a missing column is treated as empty.
    fruit_fields = ('Primary Fruit', 'Produce')
    kiwi_companies = [
        record
        for record in rows
        if any('kiwi' in record.get(field, '').lower() for field in fruit_fields)
    ]

    print(f"\n\nKiwifruit companies: {len(kiwi_companies)}")
    for row in kiwi_companies:
        print(f"  {row.get('Company Name')}")
        print(f"    Primary Fruit: {row.get('Primary Fruit')}")
        print(f"    Produce: {row.get('Produce')}")
