# Source file: local-ocr/ktp_extractor.py (1509 lines, 71 KiB, Python).
# NOTE(review): the original hosting page warned about "ambiguous Unicode
# characters" — full-width colons (U+FF1A) used in the OCR patterns may have
# been altered or stripped in this copy; verify against the original file.
"""
KTP Field Extractor
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
OPTIMIZED: Pre-compiled regex patterns for better performance
"""
import re
from typing import Dict, Optional, List
import difflib
# Debug mode - set to False for production
DEBUG_MODE = False
class KTPExtractor:
    """Extract structured fields from Indonesian ID-card (KTP) OCR output."""

    # --- Pre-compiled regex patterns (compiled once for performance) ---
    # Matches both the ASCII colon and the full-width colon (U+FF1A) that
    # CJK-trained OCR engines frequently emit; the module docstring promises
    # full-width-colon support, so both forms must be normalized.
    COLON_PATTERN = re.compile(r'[:\uff1a]')
    # NIK is always exactly 16 digits.
    NIK_PATTERN = re.compile(r'\b(\d{16})\b')
    # Dates like 17-08-1945 or 17/08/1945.
    DATE_PATTERN = re.compile(r'(\d{2}[-/]\d{2}[-/]\d{4})')
    # "003/004"-style RT/RW pairs (three digits each side).
    RT_RW_PATTERN = re.compile(r'(\d{3})\s*/\s*(\d{3})')
    # Blood type. NOTE(review): `[ABO]{1,2}` also admits invalid pairs such
    # as "BO"/"BA" — validate downstream if strictness matters.
    GOL_DARAH_PATTERN = re.compile(r'([ABO]{1,2}[+\-]?)', re.IGNORECASE)
    PROVINSI_SPLIT_PATTERN = re.compile(r'(?i)provinsi\s*')
    KABUPATEN_SPLIT_PATTERN = re.compile(r'(?i)\s*(kabupaten|kota)\s*')
    TTL_PATTERN = re.compile(r'(?i)tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
    # Raw colon pattern string (kept for backward compatibility with callers
    # that expect the pattern text rather than a compiled object).
    COLON_PATTERN_STR = r'[:\uff1a]'

    # List of Indonesian provinces (38 provinces).
    PROVINSI_LIST = [
        "ACEH", "SUMATERA UTARA", "SUMATERA BARAT", "RIAU", "JAMBI", "SUMATERA SELATAN", "BENGKULU", "LAMPUNG",
        "KEPULAUAN BANGKA BELITUNG", "KEPULAUAN RIAU", "DKI JAKARTA", "JAWA BARAT", "JAWA TENGAH", "DI YOGYAKARTA",
        "JAWA TIMUR", "BANTEN", "BALI", "NUSA TENGGARA BARAT", "NUSA TENGGARA TIMUR", "KALIMANTAN BARAT",
        "KALIMANTAN TENGAH", "KALIMANTAN SELATAN", "KALIMANTAN TIMUR", "KALIMANTAN UTARA", "SULAWESI UTARA",
        "SULAWESI TENGAH", "SULAWESI SELATAN", "SULAWESI TENGGARA", "GORONTALO", "SULAWESI BARAT", "MALUKU",
        "MALUKU UTARA", "PAPUA BARAT", "PAPUA", "PAPUA SELATAN", "PAPUA TENGAH", "PAPUA PEGUNUNGAN", "PAPUA BARAT DAYA"
    ]

    # Keywords used to detect gender values.
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Valid religion values.
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupation keywords.
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # Valid marital-status values.
    STATUS_PERKAWINAN_LIST = ['BELUM KAWIN', 'KAWIN', 'CERAI HIDUP', 'CERAI MATI']

    # Field labels for fuzzy matching (handles OCR typos such as "Aamat" -> "ALAMAT").
    FIELD_LABELS = {
        'nama': ['NAMA'],
        'alamat': ['ALAMAT'],
        'agama': ['AGAMA'],
        'pekerjaan': ['PEKERJAAN'],
        'kewarganegaraan': ['KEWARGANEGARAAN', 'WARGANEGARA'],
        'tempat_lahir': ['TEMPAT', 'LAHIR', 'TEMPAT/TGL LAHIR'],
        'jenis_kelamin': ['JENIS KELAMIN', 'JENIS', 'KELAMIN'],
        'gol_darah': ['GOL. DARAH', 'GOL DARAH', 'GOLONGAN DARAH'],
        'kel_desa': ['KEL/DESA', 'KELURAHAN', 'DESA'],
        'kecamatan': ['KECAMATAN', 'KEC'],
        'status_perkawinan': ['STATUS PERKAWINAN', 'PERKAWINAN'],
        'berlaku_hingga': ['BERLAKU HINGGA', 'BERLAKU'],
        'rt_rw': ['RT/RW', 'RT', 'RW'],
    }

    # ============================================
    # Balinese Hindu naming system
    # ============================================
    # Structure: [gender prefix] + [caste title] + [gender marker] + [birth order] + [personal name]

    # Gender prefix (must appear at the start of the name).
    BALI_GENDER_PREFIX = {
        'NI': 'PEREMPUAN',   # female prefix
        'I': 'LAKI-LAKI',    # male prefix
    }

    # Caste titles (after the gender prefix).
    BALI_KASTA = {
        'IDA': 'BRAHMANA',
        'GUSTI': 'KSATRIA',
        'ANAK AGUNG': 'KSATRIA',
        'COKORDA': 'KSATRIA',
        'DEWA': 'KSATRIA',
        'DESAK': 'KSATRIA',
        'AGUNG': 'KSATRIA',
        'NGAKAN': 'WAISYA',
        'SANG': 'WAISYA',
        'SI': 'WAISYA',
    }

    # Additional gender markers (after the caste title).
    BALI_GENDER_MARKER = {
        'AYU': 'PEREMPUAN',
        'ISTRI': 'PEREMPUAN',
        'LUH': 'PEREMPUAN',
        'BAGUS': 'LAKI-LAKI',
        'GEDE': 'LAKI-LAKI',
        'AGUS': 'LAKI-LAKI',
        'ALIT': 'LAKI-LAKI',  # "small/young" (used for males)
    }

    # Birth-order names (cycle repeats every 4 children).
    BALI_BIRTH_ORDER = {
        'PUTU': 1, 'WAYAN': 1, 'GEDE': 1, 'ILUH': 1,
        'MADE': 2, 'KADEK': 2, 'NENGAH': 2,
        'NYOMAN': 3, 'KOMANG': 3,
        'KETUT': 4,
        'BALIK': 5,  # for the 5th+ child (cycle restarts)
    }

    # Soroh/clan names (lineage identification).
    BALI_SOROH = {
        'PASEK': 'SOROH',     # majority clan (~60% of Balinese Hindus)
        'PANDE': 'SOROH',     # blacksmith/metalworking clan
        'ARYA': 'SOROH',      # Arya clan
        'BENDESA': 'SOROH',   # customary leaders
        'TANGKAS': 'SOROH',   # Tangkas clan
        'CELAGI': 'SOROH',    # Celagi clan
        'SENGGUHU': 'SOROH',  # Sengguhu clan
        'KUBAYAN': 'SOROH',   # Kubayan clan
        'BANDESA': 'SOROH',   # variant of Bendesa
    }

    # All name components for detection (longer entries listed first).
    BALI_NAME_COMPONENTS = [
        # Gender prefixes
        'NI', 'I',
        # Caste titles (longer ones take priority)
        'ANAK AGUNG', 'COKORDA', 'NGAKAN',
        'IDA', 'GUSTI', 'DEWA', 'DESAK', 'AGUNG', 'SANG', 'SI',
        # Soroh/clans
        'PASEK', 'PANDE', 'ARYA', 'BENDESA', 'BANDESA', 'TANGKAS', 'CELAGI', 'SENGGUHU', 'KUBAYAN',
        # Gender markers
        'AYU', 'ISTRI', 'LUH', 'BAGUS', 'GEDE', 'AGUS', 'ALIT',
        # Birth order
        'WAYAN', 'PUTU', 'ILUH', 'MADE', 'KADEK', 'NENGAH', 'NYOMAN', 'KOMANG', 'KETUT', 'BALIK',
    ]

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max)
    # based on the standard KTP layout. Dict order matters: _get_zone returns
    # the FIRST zone containing a point, so overlapping zones resolve top-down.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Name area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Place/date of birth
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Gender (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Blood type (right of gender)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Address
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Village
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # District
        'agama': (0.02, 0.63, 0.70, 0.72),             # Religion
        'status': (0.02, 0.69, 0.70, 0.78),            # Marital status
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Occupation
        'wni': (0.02, 0.81, 0.70, 0.90),               # Citizenship
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Valid until
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Issuing place & date
    }
def __init__(self):
self.image_width = 0
self.image_height = 0
def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
"""Determine which zone a text belongs to based on normalized coordinates"""
if img_width == 0 or img_height == 0:
return None
# Normalize coordinates
x_norm = x_center / img_width
y_norm = y_center / img_height
for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
return zone_name
return None
def _extract_value_from_text(self, text: str) -> str:
"""Extract value part from label:value text"""
# Split by colon (standard or full-width)
parts = re.split(r'[:]', text, 1)
if len(parts) > 1:
return parts[1].strip()
return text.strip()
def _find_best_match(self, text: str, candidates: List[str], cutoff: float = 0.6) -> Optional[str]:
"""Find best fuzzy match from candidates"""
matches = difflib.get_close_matches(text, candidates, n=1, cutoff=cutoff)
return matches[0] if matches else None
def _is_label_match(self, text: str, field_name: str, cutoff: float = 0.7) -> bool:
"""
Fuzzy match untuk label field - mengatasi typo OCR seperti "Aamat" -> "ALAMAT"
Returns True jika text cocok dengan salah satu label untuk field tersebut
"""
if not text or not text.strip():
return False
if field_name not in self.FIELD_LABELS:
return field_name.lower() in text.lower()
text_upper = text.upper().strip()
# Explicit conflict prevention
if field_name == 'agama' and 'ALAMAT' in text_upper:
return False
if field_name == 'alamat' and 'AGAMA' in text_upper:
return False
# Coba exact match dulu (lebih cepat)
for label in self.FIELD_LABELS[field_name]:
if label in text_upper:
return True
# Fuzzy match jika tidak ada exact match
# Ekstrak kata pertama dari text (biasanya label ada di awal)
parts = text_upper.split(':')[0].split()
if not parts:
return False
first_word = parts[0]
for label in self.FIELD_LABELS[field_name]:
label_parts = label.split()
if not label_parts:
continue
# Bandingkan dengan kata pertama
ratio = difflib.SequenceMatcher(None, first_word, label_parts[0]).ratio()
# Dynamic cutoff logic
effective_cutoff = cutoff
if len(first_word) < 7:
# Use stricter cutoff for short words to prevent ALAMAT (6) matching AGAMA (5) -> ratio 0.73
effective_cutoff = max(cutoff, 0.82)
if ratio >= effective_cutoff:
if DEBUG_MODE:
print(f" [FUZZY LABEL] '{first_word}' matched '{label}' (ratio={ratio:.2f})")
return True
return False
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
"""
Extract content after a label (fuzzy/regex match).
Handles cases with/without colons.
"""
if not text: return None
# 1. Try Regex Search if pattern provided
if label_pattern:
# Construct regex: Label + optional spaces/colon + (Group 1: Value)
# flags=re.IGNORECASE should be used
# We want to find the END of the label
match = re.search(f"({label_pattern})[:\\s]*", text, re.IGNORECASE)
if match:
# Return everything after the match end
return text[match.end():].strip()
return None
def _parse_balinese_name(self, name: str) -> str:
    """
    Re-insert spaces into a Balinese name that OCR merged into one token.

    Example: "NIGUSTIAYUNYOMANSUWETRI" -> "NI GUSTI AYU NYOMAN SUWETRI"

    Balinese name structure:
    [gender prefix] + [caste title] + [gender marker] + [birth order] + [personal name]

    IMPORTANT: only names that actually contain Balinese components are
    processed; everything else is returned upper-cased and otherwise untouched.
    """
    if not name:
        return name
    name_upper = name.upper().strip()
    # Already has a reasonable number of spaces -- return as-is.
    if name_upper.count(' ') >= 2:
        return name_upper
    # Decide whether the name looks Balinese: it must start with NI,
    # I GUSTI, IDA, or a Balinese birth-order component.
    name_clean = name_upper.replace(' ', '')
    is_balinese_name = False
    # Check typical Balinese prefixes.
    if name_clean.startswith('NI') and len(name_clean) > 3:
        # "NI" must be followed by another Balinese component (GUSTI, LUH, WAYAN, ...).
        after_ni = name_clean[2:]
        for comp in ['GUSTI', 'LUH', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG', 'PASEK', 'PANDE']:
            if after_ni.startswith(comp):
                is_balinese_name = True
                break
    elif name_clean.startswith('IGUSTI') or name_clean.startswith('IDABAGUS') or name_clean.startswith('IDAAYU'):
        is_balinese_name = True
    elif any(name_clean.startswith(p) for p in ['GUSTI', 'WAYAN', 'PUTU', 'MADE', 'KADEK', 'NYOMAN', 'KOMANG', 'KETUT']):
        is_balinese_name = True
    if not is_balinese_name:
        # Not a Balinese name: return with standard spacing.
        # If it already has one space, return unchanged.
        if ' ' in name_upper:
            return name_upper
        # No space at all: return unchanged (may genuinely be a single word).
        return name_upper
    # Components to search for, longest-first for accuracy (so e.g.
    # "ANAK AGUNG" wins over "AGUNG").
    components_ordered = sorted(self.BALI_NAME_COMPONENTS, key=len, reverse=True)
    result_parts = []
    remaining = name_clean
    # Parse the gender prefix (leading NI or I).
    if remaining.startswith('NI'):
        result_parts.append('NI')
        remaining = remaining[2:]
    elif remaining.startswith('I') and len(remaining) > 1:
        # Make sure the I is not merely the first letter of another word.
        next_char = remaining[1] if len(remaining) > 1 else ''
        # Only treat it as the male prefix when followed by a consonant.
        if next_char not in 'AIUEO':
            result_parts.append('I')
            remaining = remaining[1:]
    # Greedily peel known components off the front of the remainder.
    # NOTE(review): duplicated components are consumed but not re-appended
    # (the second clause of the dedup condition is always False since
    # components_ordered is drawn from BALI_NAME_COMPONENTS) -- confirm this
    # drop-on-duplicate behavior is intended.
    found = True
    max_iterations = 10  # Prevent infinite loop
    iteration = 0
    while remaining and found and iteration < max_iterations:
        found = False
        iteration += 1
        for component in components_ordered:
            if remaining.startswith(component):
                # Skip components already collected (except the personal name)
                if component not in result_parts or component not in self.BALI_NAME_COMPONENTS:
                    result_parts.append(component)
                remaining = remaining[len(component):]
                found = True
                break
    # Whatever is left over is the personal name.
    if remaining:
        result_parts.append(remaining)
    parsed_name = ' '.join(result_parts)
    # Log when a change was made (unconditional, not gated by DEBUG_MODE).
    if parsed_name != name_upper:
        print(f" [BALI NAME] '{name_upper}' -> '{parsed_name}'")
    return parsed_name
def _search_best_match_in_text(self, text: str, candidates: List[str], prefix: str = "") -> tuple:
"""
Search if any candidate is present in text using multiple strategies:
1. Exact substring
2. Prefix + Candidate (Fuzzy) - e.g. "PROVINSI BALI"
3. Candidate Only (Fuzzy) - e.g. "BALI" (if prefix is missing/damaged)
Returns (best_candidate, confidence_score)
"""
text_upper = text.upper()
best_match = None
best_ratio = 0.0
# Strategy 1: Exact substring match (fastest & most reliable)
for candidate in candidates:
if candidate in text_upper:
if len(candidate) > len(best_match or ""):
best_match = candidate
best_ratio = 1.0
if best_ratio == 1.0:
return best_match, best_ratio
# Strategy 2: Prefix Construction & Fuzzy Match
prefix_upper = prefix.upper() if prefix else ""
# DEBUG: Print checking (controlled by DEBUG_MODE)
if DEBUG_MODE:
print(f"DEBUG Check Text: '{text_upper}' with Prefix: '{prefix_upper}'")
for candidate in candidates:
# 2a. Compare with Prefix + Space (e.g. "PROVINSI BALI")
if prefix:
target_spaced = f"{prefix_upper} {candidate}"
s_spaced = difflib.SequenceMatcher(None, target_spaced, text_upper)
ratio_spaced = s_spaced.ratio()
# print(f" -> Compare '{target_spaced}' vs '{text_upper}' = {ratio_spaced:.2f}")
if ratio_spaced > best_ratio and ratio_spaced > 0.5:
best_ratio = ratio_spaced
best_match = candidate
# 2b. Compare with Prefix NO SPACE (e.g. "PROVINSIBALI")
# This handles "PROVNSIBALI" perfectly
target_merged = f"{prefix_upper}{candidate}"
s_merged = difflib.SequenceMatcher(None, target_merged, text_upper)
ratio_merged = s_merged.ratio()
if DEBUG_MODE:
print(f" -> Compare Merged '{target_merged}' vs '{text_upper}' = {ratio_merged:.2f}")
if ratio_merged > best_ratio and ratio_merged > 0.5:
best_ratio = ratio_merged
best_match = candidate
# 2c. Compare Candidate ONLY (e.g. "BALI")
if len(candidate) > 3:
s_raw = difflib.SequenceMatcher(None, candidate, text_upper)
ratio_raw = s_raw.ratio()
# print(f" -> Compare Raw '{candidate}' vs '{text_upper}' = {ratio_raw:.2f}")
if ratio_raw > best_ratio and ratio_raw > 0.6:
best_ratio = ratio_raw
best_match = candidate
if DEBUG_MODE:
print(f"DEBUG Best Match: {best_match} ({best_ratio:.2f})")
return best_match, best_ratio
def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
"""Detect image dimensions from bounding boxes"""
max_x, max_y = 0, 0
for r in ocr_results:
bbox = r.get('bbox', [])
if bbox and len(bbox) >= 4:
for point in bbox:
if len(point) >= 2:
max_x = max(max_x, point[0])
max_y = max(max_y, point[1])
# Add some margin
return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)
def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
"""Extract fields based on zone assignments"""
# PROVINSI from header
if 'header_provinsi' in zone_texts:
print(f"DEBUG Zone Provinsi Content: {zone_texts['header_provinsi']}")
for text in zone_texts['header_provinsi']:
text_clean = text.strip()
# Use prefix strategy: "PROVINSI " + result vs text
match, score = self._search_best_match_in_text(text_clean, self.PROVINSI_LIST, prefix="PROVINSI")
# LOWER THRESHOLD to 0.5 because "PROVINSI BALI" vs "PROVNSIBALI" is roughly 0.5-0.6 range
if match and score > 0.5:
result['provinsi'] = match
# Remove the found province (and label) from text to see what's left
# If we matched "PROVINSI JAWA TIMUR", the text might be "PROVNSIJAWATMRKABUPATENSUMENEP"
# It's hard to cleanly remove "PROVISI JAWA TIMUR" if it was fuzzy matched.
# BUT, we can try to find "KABUPATEN" or "KOTA" in the original text
# independent of the province match
if 'kabupaten' in text_clean.lower() or 'kota' in text_clean.lower():
parts = re.split(r'(?i)\s*(kabupaten|kota)', text_clean)
if len(parts) > 1:
kab_part = "".join(parts[1:]).strip()
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
if kab_val and result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_clean.lower() else "KOTA"
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
break
# Fallback to keyword splitting (Legacy/Blurry fallback)
text_lower = text.lower()
val = text
# If keyword exists, strip it
if 'provinsi' in text_lower:
split_prov = re.split(r'(?i)provinsi\s*', text, 1)
if len(split_prov) > 1:
val = split_prov[1].strip()
else:
val = ""
# Check for merged text
if 'kabupaten' in text_lower or 'kota' in text_lower:
parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
val = parts[0].strip()
if len(parts) > 1:
kab_part = "".join(parts[1:]).strip()
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
if kab_val and result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
if val and len(val) > 2:
# Try fuzzy match again on the cleaned value
best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val.upper()
break
# KABUPATEN/KOTA from header
if 'header_kabupaten' in zone_texts:
for text in zone_texts['header_kabupaten']:
text_lower = text.lower()
val = text
# Check keyword
if 'kabupaten' in text_lower or 'kota' in text_lower:
split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
if len(split_kab) > 1:
val = split_kab[-1].strip()
else:
val = ""
# If no keyword, but it's in the kabupaten zone, assume it's data
if val:
# Re-add prefix standard if we separated it or if it was missing
# Heuristic: if validation suggests it's a known regency, we are good.
# For now, standardize format.
if result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
# If no keyword found, default to KABUPATEN? Or better check Wilayah?
# Let's default to detected keyword or KABUPATEN
if "kota" in text_lower:
prefix = "KOTA"
else:
prefix = "KABUPATEN"
result['kabupaten_kota'] = f"{prefix} {val.upper()}"
break
# NAMA from nama zone (skip label line)
if 'nama' in zone_texts:
for text in zone_texts['nama']:
text_lower = text.lower()
if 'nama' not in text_lower and len(text) > 2:
result['nama'] = text.upper()
break
elif 'nama' in text_lower:
val = self._extract_value_from_text(text)
if val and 'nama' not in val.lower():
result['nama'] = val.upper()
# TTL from ttl zone
if 'ttl' in zone_texts:
for text in zone_texts['ttl']:
# Skip if text is JUST the label (length check or fuzzy match)
if len(text) < 15 and self._is_label_match(text, 'tempat_lahir'):
continue
if 'tempat' in text.lower() or 'lahir' in text.lower() or 'tgl' in text.lower() or len(text) > 5:
val = self._extract_value_from_text(text)
if val:
# Don't accept if val looks like label
if self._is_label_match(val, 'tempat_lahir') and len(val) < 20:
continue
self._parse_ttl(val, result)
# Only break if we actually got a birth date, otherwise keep looking
if result['tanggal_lahir']:
break
# JENIS KELAMIN
if 'jenis_kelamin' in zone_texts:
for text in zone_texts['jenis_kelamin']:
text_lower = text.lower()
if 'laki' in text_lower:
result['jenis_kelamin'] = 'LAKI-LAKI'
break
elif 'perempuan' in text_lower or 'wanita' in text_lower:
result['jenis_kelamin'] = 'PEREMPUAN'
break
# GOL DARAH
if 'gol_darah' in zone_texts:
for text in zone_texts['gol_darah']:
gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
if gol_match:
result['gol_darah'] = gol_match.group(1).upper()
break
# ALAMAT
if 'alamat' in zone_texts:
for text in zone_texts['alamat']:
if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1:
val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
if val and 'alamat' not in val.lower():
result['alamat'] = val.upper()
break
# RT/RW
if 'rt_rw' in zone_texts:
for text in zone_texts['rt_rw']:
rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
if rt_rw_match:
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
break
# KEL/DESA
if 'kel_desa' in zone_texts:
for text in zone_texts['kel_desa']:
if 'kel' in text.lower() or 'desa' in text.lower():
val = self._extract_value_from_text(text)
if val and 'kel' not in val.lower():
result['kel_desa'] = val.upper()
break
elif result['kel_desa'] is None:
# Fallback context: simple text
result['kel_desa'] = text.upper()
# KECAMATAN
if 'kecamatan' in zone_texts:
for text in zone_texts['kecamatan']:
if 'kec' in text.lower():
val = self._extract_value_from_text(text)
if val and 'kec' not in val.lower():
result['kecamatan'] = val.upper()
break
elif result['kecamatan'] is None:
result['kecamatan'] = text.upper()
# AGAMA
if 'agama' in zone_texts:
for text in zone_texts['agama']:
val = text.upper()
if 'agama' in text.lower():
val = self._extract_value_from_text(text).upper()
# Verify against valid list
for agama in self.AGAMA_LIST:
if agama.upper() in val:
result['agama'] = agama.upper()
break
if result['agama']: break
# STATUS PERKAWINAN
if 'status' in zone_texts:
for text in zone_texts['status']:
val = text.upper()
# Normalize common OCR errors (e.g. BELUMKAWIN)
val = val.replace("BELUMKAWIN", "BELUM KAWIN")
# Check against official list
found_status = False
for status in self.STATUS_PERKAWINAN_LIST:
if status in val:
result['status_perkawinan'] = status
found_status = True
break
if found_status: break
# PEKERJAAN
if 'pekerjaan' in zone_texts:
best_job = None
potential_job = None
for text in zone_texts['pekerjaan']:
val = text.upper()
if 'pekerjaan' in text.lower():
val = self._extract_value_from_text(text).upper()
# Clean up
val = val.strip()
if not val or len(val) < 3 or 'PEKERJAAN' in val:
continue
# 1. Check against wildcard/list (Priority)
# Buruh, Karyawan, Pelajar, dll
if any(job.upper() in val for job in self.PEKERJAAN_LIST):
best_job = val
break # Found a definitive job
# 2. Save as potential if it's NOT a known bad value (like City names)
# Avoid capturing 'TABANAN', 'JAKARTA', date strings
if not any(city in val for city in ['KABUPATEN', 'KOTA', 'TABANAN', 'BADUNG', 'DENPASAR', 'JAKARTA', 'BANDUNG']):
if not re.search(r'\d{2}-\d{2}-\d{4}', val): # Avoid dates
if potential_job is None:
potential_job = val
if best_job:
result['pekerjaan'] = best_job
elif potential_job:
result['pekerjaan'] = potential_job
# WNI
if 'wni' in zone_texts:
for text in zone_texts['wni']:
if 'wni' in text.lower():
result['kewarganegaraan'] = 'WNI'
break
elif 'wna' in text.lower():
result['kewarganegaraan'] = 'WNA'
break
# PENERBITAN area (tempat & tanggal dalam satu zona)
if 'penerbitan' in zone_texts:
for text in zone_texts['penerbitan']:
# Look for date
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
if date_match and result['tanggal_penerbitan'] is None:
result['tanggal_penerbitan'] = date_match.group(1)
def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
"""
Ekstrak field KTP dari hasil OCR dengan template-based zone detection
Args:
ocr_results: List hasil dari OCREngine.extract_text()
Returns:
Dict dengan field KTP
"""
result = {
'nik': None,
'nama': None,
'tempat_lahir': None,
'tanggal_lahir': None,
'jenis_kelamin': None,
'gol_darah': None,
'alamat': None,
'rt_rw': None,
'kel_desa': None,
'kecamatan': None,
'agama': None,
'status_perkawinan': None,
'pekerjaan': None,
'kewarganegaraan': None,
'berlaku_hingga': 'SEUMUR HIDUP', # Default sesuai peraturan pemerintah e-KTP
'provinsi': None,
'kabupaten_kota': None,
'tanggal_penerbitan': None,
}
# Detect image dimensions from bounding boxes
img_width, img_height = self._detect_image_size(ocr_results)
# Assign zones to each OCR result
zone_texts = {} # zone_name -> list of texts
for r in ocr_results:
x_center = r.get('x_center', 0)
y_center = r.get('y_center', 0)
zone = self._get_zone(x_center, y_center, img_width, img_height)
if zone:
if zone not in zone_texts:
zone_texts[zone] = []
zone_texts[zone].append(r['text'])
# Debug: print zone assignments
print("\n[DEBUG KTPExtractor] Zone assignments:")
for zone, texts in zone_texts.items():
print(f" {zone}: {texts}")
# Extract fields using zone-based approach
self._extract_by_zones(zone_texts, result)
# Gabungkan semua teks untuk fallback pattern matching
texts = [r['text'].strip() for r in ocr_results]
all_text = '\n'.join(texts)
# Ekstrak NIK (16 digit) - bisa ada di mana saja
nik_match = re.search(r'\b(\d{16})\b', all_text)
if nik_match:
result['nik'] = nik_match.group(1)
print(f" -> NIK found: {result['nik']}")
# Fallback: Parse line by line for fields not found by zone
for i, text in enumerate(texts):
# Skip baris yang hanya berisi punctuation atau kosong
text_stripped = text.strip()
if not text_stripped or text_stripped in [':', '', '.', '-', '/', '|']:
continue
# Skip baris yang terlalu pendek (hanya 1-2 karakter non-alfanumerik)
if len(text_stripped) <= 2 and not any(c.isalnum() for c in text_stripped):
continue
text_lower = text.lower()
# Normalize colons
text_normalized = re.sub(self.COLON_PATTERN, ':', text)
text_norm_lower = text_normalized.lower()
# ===== PROVINSI =====
if 'provinsi' in text_lower and result['provinsi'] is None:
# Split by PROVINSI and take remainder
split_prov = re.split(r'(?i)provinsi\s*', text, 1)
if len(split_prov) > 1:
val = split_prov[1].strip()
# Check if it contains kabupaten/kota (merged line case)
if 'kabupaten' in val.lower() or 'kota' in val.lower():
parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
val = parts[0].strip()
if val:
# Fuzzy match against valid provinces
best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val.upper()
# Check for next line if current line only had 'PROVINSI'
if result['provinsi'] is None and i + 1 < len(texts):
next_text = texts[i+1].strip()
next_lower = next_text.lower()
# Only take next line if it doesn't look like another field
if not any(kw in next_lower for kw in ['provinsi', 'kabupaten', 'kota', 'nik']):
# Fuzzy match next line
val = next_text.upper()
best_match = self._find_best_match(val, self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val
# ===== KABUPATEN/KOTA =====
if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None:
if 'provinsi' not in text_lower: # Bukan bagian dari provinsi
# Split by KABUPATEN or KOTA and take remainder
split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
if len(split_kab) > 1:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
val = split_kab[-1].strip()
if val:
result['kabupaten_kota'] = f"{prefix} {val.upper()}"
else:
result['kabupaten_kota'] = text.strip().upper()
else:
result['kabupaten_kota'] = text.strip().upper()
# ===== NAMA =====
if result['nama'] is None and self._is_label_match(text, 'nama'):
val = self._extract_after_label(text_normalized, 'nama')
current_name = ""
if val:
current_name = val.upper()
# Loop check baris berikutnya for Name (handle 2-3 lines)
offset = 1
# Batasi maksimal 2 baris tambahan untuk Nama (total 3 baris)
while i + offset < len(texts) and offset <= 2:
next_text = texts[i+offset].strip()
next_lower = next_text.lower()
is_stop = False
# 1. Check Stop Keywords (Field Labels below Name)
# Stop if next line is Tempat Lahir, Jenis Kelamin, Alamat, etc.
stop_keywords = ['tempat', 'lahir', 'tgl', 'jenis', 'kelamin', 'alamat', 'rt/rw', 'nik']
if any(kw in next_lower for kw in stop_keywords):
is_stop = True
print(f" [NAMA STOP] Matched stop keyword in '{next_text}'")
# 2. Check Case Sensitivity (Heuristic)
if not is_stop:
letters = [c for c in next_text if c.isalpha()]
if letters:
upper_count = sum(1 for c in letters if c.isupper())
upper_ratio = upper_count / len(letters)
# If mostly lowercase/title case, likely a label (e.g. "Tempat Lahir")
if upper_ratio < 0.4 and len(letters) > 3:
is_stop = True
print(f" [NAMA STOP] Likely Label based on Case (Ratio={upper_ratio:.2f})")
if not is_stop:
if len(next_text) > 2:
print(f" [NAMA MERGE] Merging '{next_text}'")
if current_name:
current_name += " " + next_text.upper()
else:
current_name = next_text.upper()
offset += 1
else:
print(f" [NAMA SKIP] Too short '{next_text}'")
# Kalau terlalu pendek (noise), boleh skip atau stop?
# Biasanya nama tidak putus jadi 1 huruf. Anggap stop utk aman, atau skip.
# Kita skip saja increment offset.
offset += 1
else:
break
if current_name:
# Fix Spacing Issues (e.g. BAGUSGEDE -> BAGUS GEDE)
current_name = re.sub(r'(BAGUS)(GEDE)', r'\1 \2', current_name)
current_name = re.sub(r'(ANAK)(AGUNG)', r'\1 \2', current_name) # Common issue
result['nama'] = current_name
# ===== TEMPAT/TANGGAL LAHIR =====
# ... (starts around line 830 in original) ...
# (Skipping down to ALAMAT section for the replacement block)
# ... regex find ...
# ===== ALAMAT ===== (dengan fuzzy label matching)
if result['alamat'] is None and self._is_label_match(text, 'alamat'):
val = self._extract_after_label(text_normalized, r'a{1,2}l{0,2}a?m{0,2}a?t')
# Logic multi-line
current_addr = ""
if val:
current_addr = val.upper()
# Loop check baris berikutnya (bisa ambil i+1, i+2, dst selama bukan label)
offset = 1
while i + offset < len(texts):
next_text = texts[i+offset].strip()
print(f" [ALAMAT CHECK] Offset +{offset}: '{next_text}'")
next_lower = next_text.lower()
is_stop = False
# 1. Cek Pola RT/RW (angka/angka) -> Pasti STOP
if re.search(r'\d{3}\s*/\s*\d{3}', next_text) or re.match(r'^[.\-]+\s*/\s*[.\-]+$', next_text):
is_stop = True
print(" [ALAMAT STOP] Matched RT/RW pattern")
# 2. Cek Keywords Label Pembatas
elif any(next_lower.startswith(prefix) for prefix in ['rt/', 'rw', 'rt/rw', 'kel', 'desa', 'kec', 'agama', 'status', 'kawin']):
is_stop = True
print(" [ALAMAT STOP] Matched label prefix")
# 3. Cek Keywords Spesifik Full Word
elif any(kw in next_lower for kw in ['kelurahan', 'kecamatan', 'perkawinan', 'kewarganegaraan']):
is_stop = True
print(" [ALAMAT STOP] Matched distinct label word")
# 4. Check Case Sensitivity
if not is_stop:
letters = [c for c in next_text if c.isalpha()]
if letters:
upper_count = sum(1 for c in letters if c.isupper())
upper_ratio = upper_count / len(letters)
# Jika hampir semua huruf kecil/Title Case (ratio < 0.4), dicurigai sebagai Label
# Kecuali kata-kata pendek (< 5 chars)
if upper_ratio < 0.4 and len(letters) > 4:
is_stop = True
print(f" [ALAMAT STOP] Detected Title Case/Lowercase (Ratio={upper_ratio:.2f}) -> Likely Label")
# Jika BUKAN pembatas, AMBIL sebagai lanjutan alamat
if not is_stop:
if len(next_text) > 1:
print(f" [ALAMAT MERGE] Merging '{next_text}'")
if current_addr:
current_addr += " " + next_text.upper()
else:
current_addr = next_text.upper()
offset += 1 # Lanjut cek baris berikutnya
else:
print(f" [ALAMAT SKIP] Line too short '{next_text}'")
offset += 1 # Skip noise, try next line? Or stop? usually skip noise is safer to continue
else:
print(f" [ALAMAT STOP] Hit Stop Condition '{next_text}'")
break # Stop loop
if current_addr:
result['alamat'] = current_addr
if current_addr:
result['alamat'] = current_addr
# ===== RT/RW =====
# Relaxed pattern to handle -/- or 000/000
if result['rt_rw'] is None:
rt_rw_match = re.search(r'(\d{1,3}|-)\s*/\s*(\d{1,3}|-)', text)
if rt_rw_match:
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
print(f" [RT/RW] Found {result['rt_rw']}")
# ===== KELURAHAN/DESA =====
if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
if result['kel_desa'] is None:
val = self._extract_after_label(text_normalized, 'kel|desa')
if val:
result['kel_desa'] = val.upper()
elif i + 1 < len(texts):
result['kel_desa'] = texts[i+1].strip().upper()
# ===== TEMPAT/TANGGAL LAHIR =====
# Gunakan _is_label_match untuk fleksibilitas (e.g. Tempat/Tgl Lahir, Tmpt Lahir)
if result['tempat_lahir'] is None and self._is_label_match(text, 'ttl'):
print(f" [TTL DEBUG] Matched Label on line {i}: '{text}'")
# Regex pattern yang SANGAT fleksibel untuk label TTL
# Menangani berbagai variasi: Tmpat/Tgl Lahir, Tempat. Tgl. Lahir, dll
# Intinya: T...mp...t <junk> L...hir
val = self._extract_after_label(text_normalized, r't[ea]m?p?a?t.*?l[a@]hi?r?|tgl.*?l[a@]hi?r?')
# Jika val kosong, coba ambil dari baris berikutnya
if not val and i + 1 < len(texts):
next_text = texts[i+1].strip()
next_lower = next_text.lower()
stop_keywords = ['jenis', 'kelamin', 'alamat', 'gol', 'darah']
if not any(kw in next_lower for kw in stop_keywords):
val = next_text.upper()
print(f" [TTL DEBUG] Took next line: '{val}'")
if val:
print(f" [TTL DEBUG] Parsing value: '{val}'")
self._parse_ttl(val, result)
if result['tanggal_lahir']:
print(f" [TTL DEBUG] Success: {result['tanggal_lahir']}")
# ===== JENIS KELAMIN =====
if result['jenis_kelamin'] is None:
# 1. Coba cari Label dulu
if self._is_label_match(text, 'jenis_kelamin'):
val = self._extract_after_label(text_normalized, r'j[ea]ni?s\s*k[ea]l[a@]?mi?n')
if val:
if 'LAKI' in val.upper(): result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in val.upper() or 'WANITA' in val.upper(): result['jenis_kelamin'] = 'PEREMPUAN'
if result['jenis_kelamin'] is None and i + 1 < len(texts):
next_text = texts[i+1].upper()
if 'LAKI' in next_text: result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in next_text or 'WANITA' in next_text: result['jenis_kelamin'] = 'PEREMPUAN'
# 2. Fallback: Cari langsung keyword VALUES
if result['jenis_kelamin'] is None:
text_upper = text.upper()
if 'LAKI-LAKI' in text_upper or 'LAKI - LAKI' in text_upper:
result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in text_upper:
result['jenis_kelamin'] = 'PEREMPUAN'
# ===== GOLONGAN DARAH =====
if result['gol_darah'] is None:
# Cek label
if self._is_label_match(text, 'gol_darah'):
val = self._extract_after_label(text_normalized, r'g?o?l\.?\s*d?a?r?a?h')
# Jika label ketemu tapi val kosong, mungkin nempel (Gol.Darah : O)
# atau ada di baris ini
if val:
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', val)
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
else:
# Coba cari pattern gol darah di baris yang sama dengan label
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', text.upper().replace('0','O'))
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
# Cek next line jika baris ini cuma label "Gol Darah"
if result['gol_darah'] is None and self._is_label_match(text, 'gol_darah') and i+1 < len(texts):
next_text = texts[i+1].strip().upper()
if len(next_text) < 5: # Pendek, asumsi gol darah
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', next_text)
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
# ===== KECAMATAN =====
if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
if result['kecamatan'] is None:
val = self._extract_after_label(text_normalized, 'kecamatan|kec')
if val:
result['kecamatan'] = val.upper()
elif i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']):
result['kecamatan'] = next_text.upper()
# ===== AGAMA ===== (dengan fuzzy label matching)
if self._is_label_match(text, 'agama'):
val = self._extract_after_label(text_normalized, r'a?g{0,2}a?m{0,2}a')
if val and result['agama'] is None:
result['agama'] = val.upper()
elif result['agama'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip().upper()
if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']:
result['agama'] = next_text
else:
# Check if line contains only agama name
for agama in self.AGAMA_LIST:
if agama in text_lower and len(text) < 20:
if result['agama'] is None:
result['agama'] = text.strip().upper()
break
# ===== STATUS PERKAWINAN =====
if 'kawin' in text_lower:
if result['status_perkawinan'] is None:
# Check against official list first
text_upper = text.upper().replace("BELUMKAWIN", "BELUM KAWIN")
for status in self.STATUS_PERKAWINAN_LIST:
if status in text_upper:
result['status_perkawinan'] = status
break
# Fallback to extraction if not found in list
if result['status_perkawinan'] is None:
val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
if val:
result['status_perkawinan'] = val.upper()
# ===== PEKERJAAN =====
if 'pekerjaan' in text_lower:
val = self._extract_after_label(text_normalized, 'pekerjaan')
if val and result['pekerjaan'] is None:
result['pekerjaan'] = val.upper()
elif result['pekerjaan'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
result['pekerjaan'] = next_text.upper()
else:
# Check if line contains pekerjaan keyword
for pekerjaan in self.PEKERJAAN_LIST:
if pekerjaan in text_lower and len(text) < 30:
if result['pekerjaan'] is None:
result['pekerjaan'] = text.strip().upper()
break
# ===== KEWARGANEGARAAN =====
if 'wni' in text_lower:
result['kewarganegaraan'] = 'WNI'
elif 'wna' in text_lower:
result['kewarganegaraan'] = 'WNA'
elif 'warga' in text_lower and result['kewarganegaraan'] is None:
val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
if val:
result['kewarganegaraan'] = val.upper()
# ===== BERLAKU HINGGA =====
if 'berlaku' in text_lower or 'seumur' in text_lower:
if result['berlaku_hingga'] is None:
if 'seumur' in text_lower or 'hidup' in text_lower:
result['berlaku_hingga'] = 'SEUMUR HIDUP'
else:
val = self._extract_after_label(text_normalized, 'berlaku')
if val:
result['berlaku_hingga'] = val.upper()
# ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) =====
# Look for date that is NOT tanggal lahir (different date)
if result['tanggal_penerbitan'] is None:
# 1. Skip if contains Keywords of other date fields
# Jangan ambil jika ada kata 'LAHIR', 'TGL', 'BERLAKU', 'SEUMUR', 'HINGGA'
line_clean = text.lower()
if any(kw in line_clean for kw in ['lahir', 'lahlr', 'tgl', 'tempat', 'berlaku', 'seumur', 'hingga', 'hidup']):
pass # Skip
else:
# Match date format at end of text or standalone date
date_match = re.search(r'(\d{2}[-\s/]\d{2}[-\s/]\d{4})$', text.strip())
if date_match:
found_date = date_match.group(1).replace(' ', '-')
# Make sure it's not the same as tanggal_lahir
if result['tanggal_lahir'] != found_date:
# Strict Position Check: MUST be in the bottom 30% of lines
# (Untuk menghindari salah ambil tanggal lahir yg mungkin gagal diparsing sbg TTL)
if i > len(texts) * 0.7:
result['tanggal_penerbitan'] = found_date
print(f" [TGL TERBIT] Found '{found_date}' at index {i}/{len(texts)}")
else:
print(f" [TGL TERBIT SKIP] Date '{found_date}' is too high ({i}/{len(texts)})")
# ============================================
# AGGRESSIVE SCAN: Cari agama dari semua teks OCR
# ============================================
# Indonesia hanya punya 6 agama resmi, mudah dideteksi
if result['agama'] is None:
# Daftar agama dengan variasi penulisan
agama_patterns = {
'ISLAM': ['ISLAM', 'ISLM', 'ISIAM', 'ISLAMI'],
'KRISTEN': ['KRISTEN', 'KRISTEN PROTESTAN', 'PROTESTAN', 'KRISTN'],
'KATOLIK': ['KATOLIK', 'KATHOLIK', 'KATHOLK', 'KATOLIK ROMA', 'KATOLIK.'],
'HINDU': ['HINDU', 'HNDU', 'HINDU DHARMA', 'HINDHU'],
'BUDDHA': ['BUDDHA', 'BUDHA', 'BUDDA', 'BUDDHIS'],
'KONGHUCU': ['KONGHUCU', 'KHONGHUCU', 'KONGHUCHU', 'CONFUCIUS'],
}
for text in texts:
text_upper = text.upper().strip()
# Skip jika teks terlalu pendek atau terlalu panjang
if len(text_upper) < 4 or len(text_upper) > 30:
continue
for agama_std, variants in agama_patterns.items():
for variant in variants:
if variant in text_upper:
result['agama'] = agama_std
print(f" [AGAMA SCAN] Found '{variant}' in '{text_upper}' -> {agama_std}")
break
if result['agama']:
break
if result['agama']:
break
# ============================================
# AGGRESSIVE SCAN: Cari golongan darah dari semua teks OCR
# ============================================
# Golongan darah hanya 4: A, B, AB, O (dengan/tanpa rhesus +/-)
if result['gol_darah'] is None:
gol_darah_patterns = ['AB+', 'AB-', 'A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB', 'A', 'B', 'O']
for text in texts:
text_upper = text.upper().strip()
# Hapus punctuation umum
text_clean = re.sub(r'[:\.\,\s]+', '', text_upper)
# Konversi 0 (nol) menjadi O (huruf) - OCR sering salah baca
text_clean = text_clean.replace('0', 'O')
# Skip jika teks terlalu panjang (bukan gol darah)
if len(text_clean) > 10:
continue
# Cari match untuk gol darah (dari panjang ke pendek untuk prioritas AB sebelum A/B)
for gol in gol_darah_patterns:
# Exact match setelah dibersihkan
if text_clean == gol:
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
# Match dengan prefix GOL
if text_clean == f"GOL{gol}" or text_clean == f"GOLDARAH{gol}":
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
# Match sebagai single character di akhir teks pendek
if len(text_clean) <= 3 and text_clean.endswith(gol):
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
if result['gol_darah']:
break
# ============================================
# AGGRESSIVE SCAN: Cari berlaku hingga dari semua teks OCR
# ============================================
if result['berlaku_hingga'] is None:
for text in texts:
text_upper = text.upper().strip()
if 'SEUMUR' in text_upper or 'HIDUP' in text_upper:
result['berlaku_hingga'] = 'SEUMUR HIDUP'
print(f" [BERLAKU SCAN] Found '{text_upper}' -> SEUMUR HIDUP")
break
# Post-processing
result = self._post_process(result)
return result
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
"""Ekstrak nilai setelah label (supports various separators)"""
patterns = [
rf'(?:{label_pattern})\s*:\s*(.+)', # label: value
rf'(?:{label_pattern})\s+([A-Z0-9].+)', # label VALUE (uppercase start)
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
value = match.group(1).strip()
# Remove trailing colon or label fragment
value = re.sub(r'^[:\s]+', '', value)
value = re.sub(r'\s*:\s*$', '', value)
if value and len(value) > 1:
return value
return None
def _parse_ttl(self, ttl_text: str, result: Dict):
"""Parse tempat/tanggal lahir dari text"""
ttl_text = ttl_text.strip()
# Normalize dates where OCR missed dashes:
# "05 08 1978" -> "05-08-1978"
# "05 08-1978" -> "05-08-1978"
# "05-08 1978" -> "05-08-1978"
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
# Handle "0508-1978" -> "05-08-1978" (Missing separator between day/month)
ttl_text = re.sub(r'(\d{2})(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
# Handle 8-digit date without separator: "05081978" -> "05-08-1978"
date_8digit = re.search(r'(\d{8})', ttl_text)
if date_8digit:
d = date_8digit.group(1)
formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}"
ttl_text = ttl_text.replace(d, formatted)
# Handle merged city+date like "JAKARTA05-08-1978" - add space before digits
ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)
# Handle merged city+date like "JAKARTA.05-08-1978" -> replace dot with space
ttl_text = re.sub(r'([A-Z])\.(\d)', r'\1 \2', ttl_text, flags=re.IGNORECASE)
# Format: "TEMPAT, DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY"
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
if date_match:
result['tanggal_lahir'] = date_match.group(1)
# Tempat adalah bagian sebelum tanggal
place = ttl_text[:date_match.start()].strip(' ,:-/.')
# Clean up label remnants
place = re.sub(r'^(tempat|tgl|lahir||:)[/\s:]*', '', place, flags=re.IGNORECASE).strip()
if place and len(place) > 2:
result['tempat_lahir'] = place.upper()
else:
# Coba split by comma
parts = ttl_text.split(',')
if len(parts) >= 2:
result['tempat_lahir'] = parts[0].strip().upper()
result['tanggal_lahir'] = parts[1].strip()
elif len(parts) == 1 and len(ttl_text) > 2:
result['tempat_lahir'] = ttl_text.upper()
    def _post_process(self, result: Dict) -> Dict:
        """Post-process the raw extraction result in place and return it.

        Runs, in order:
          1. NIK validation (must be exactly 16 digits).
          2. Birth-date format repair (DDMM-YYYY / DDMMYYYY -> DD-MM-YYYY).
          3. Generic cleanup of every string field (colons, whitespace).
          4. Removal of label fragments captured inside values.
          5. Normalisation of marital status and the validity field
             (which defaults to 'SEUMUR HIDUP' per e-KTP regulation).
          6. Balinese name parsing when the card looks Balinese.
          7. Fuzzy validation of the religion field.
          8. De-merging of glued region/province/address words.
          9. Cross-validation of birth place against kel/desa.
        """
        # --- 1. NIK validation: keep only a clean 16-digit string ---
        if result['nik'] and not re.match(r'^\d{16}$', result['nik']):
            cleaned = re.sub(r'\D', '', result['nik'])
            if len(cleaned) == 16:
                result['nik'] = cleaned
            else:
                # Not recoverable as a 16-digit NIK -> drop it.
                result['nik'] = None
        # --- 2. Repair malformed birth-date formats ---
        # Pattern: DDMM-YYYY (e.g. 1608-1976) -> DD-MM-YYYY (16-08-1976)
        if result['tanggal_lahir']:
            tl = result['tanggal_lahir']
            # DDMM-YYYY (first separator missing)
            wrong_format = re.match(r'^(\d{2})(\d{2})-(\d{4})$', tl)
            if wrong_format:
                result['tanggal_lahir'] = f"{wrong_format.group(1)}-{wrong_format.group(2)}-{wrong_format.group(3)}"
                print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")
            # DDMMYYYY (no separator at all)
            no_sep_format = re.match(r'^(\d{2})(\d{2})(\d{4})$', tl)
            if no_sep_format:
                result['tanggal_lahir'] = f"{no_sep_format.group(1)}-{no_sep_format.group(2)}-{no_sep_format.group(3)}"
                print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")
        # --- 3. Clean all string values: strip stray colons and whitespace ---
        for field in result:
            if result[field] and isinstance(result[field], str):
                val = result[field]
                # Remove leading colons (standard and full-width)
                val = re.sub(r'^[\s:]+', '', val)
                # Remove trailing colons
                val = re.sub(r'[\s:]+$', '', val)
                # Collapse repeated whitespace into single spaces
                val = re.sub(r'\s+', ' ', val)
                result[field] = val.strip()
        # --- 4. Strip field labels that were captured with the value ---
        for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
            if result[field]:
                # Remove common labels that leaked into the value
                result[field] = re.sub(
                    r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s:]*',
                    '', result[field], flags=re.IGNORECASE
                ).strip()
        # --- 5a. Marital status still containing its own label ---
        if result['status_perkawinan']:
            sp = result['status_perkawinan']
            sp = re.sub(r'^(STATUS|PERKAWINAN)[\s:]*', '', sp, flags=re.IGNORECASE).strip()
            result['status_perkawinan'] = sp
        # --- 5b. Validity ("berlaku hingga") normalisation ---
        if result['berlaku_hingga']:
            bh = result['berlaku_hingga']
            bh = re.sub(r'^(BERLAKU|HINGGA)[\s:]*', '', bh, flags=re.IGNORECASE).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh
        else:
            # Fallback: by government regulation, e-KTP cards issued since
            # 2011 are valid for life ("SEUMUR HIDUP").
            result['berlaku_hingga'] = 'SEUMUR HIDUP'
            print(" [FALLBACK] berlaku_hingga = SEUMUR HIDUP (peraturan pemerintah)")
        # ============================================
        # 6. Parse Balinese names when detected
        # ============================================
        # The card is considered Balinese when any of the following hold:
        #   1. Province == BALI
        #   2. NIK starts with 51 (Bali province code)
        #   3. Name starts with a typical Balinese component (NI, I GUSTI, ...)
        is_bali = False
        if result.get('provinsi') and 'BALI' in result['provinsi'].upper():
            is_bali = True
        elif result.get('nik') and result['nik'].startswith('51'):
            is_bali = True
        elif result.get('nama'):
            nama_upper = result['nama'].upper()
            # Check whether the name starts with a Balinese prefix
            if nama_upper.startswith('NI') or nama_upper.startswith('IGUSTI') or \
               nama_upper.startswith('IDABAGUS') or nama_upper.startswith('IDAAYU') or \
               any(nama_upper.startswith(p) for p in ['GUSTI', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG']):
                is_bali = True
        if is_bali and result.get('nama'):
            result['nama'] = self._parse_balinese_name(result['nama'])
        # ============================================
        # 7. Validate and correct the religion field
        # ============================================
        if result.get('agama'):
            agama = result['agama'].upper().strip()
            # Fuzzy match against the official list of valid religions;
            # keep the best candidate scoring above 0.6 similarity.
            agama_match = None
            best_ratio = 0
            for valid_agama in self.AGAMA_LIST:
                ratio = difflib.SequenceMatcher(None, agama, valid_agama.upper()).ratio()
                if ratio > best_ratio and ratio > 0.6:
                    best_ratio = ratio
                    agama_match = valid_agama.upper()
            if agama_match:
                if agama_match != agama:
                    print(f" [AGAMA VALIDATE] '{agama}' -> '{agama_match}' (ratio={best_ratio:.2f})")
                result['agama'] = agama_match
        # No automatic fallback for religion - it must come from the OCR
        # --- 8a. Fix merged kabupaten/kota names (JAKARTASELATAN -> JAKARTA SELATAN) ---
        if result['kabupaten_kota']:
            kk = result['kabupaten_kota']
            # Add a space before directional words
            kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                        r'\1 \2', kk, flags=re.IGNORECASE)
            # Common merged patterns
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()
        # --- 8b. Fix merged province names ---
        if result['provinsi']:
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                          r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()
        # --- 8c. Fix merged address tokens (e.g. JLKECAPIV -> JL KECAPI V) ---
        if result['alamat']:
            alamat = result['alamat']
            # Add a space after common street prefixes
            alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add a space before Roman numerals at the end (I, II, ..., X)
            alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add a space before trailing digits
            alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
            alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
            result['alamat'] = alamat.upper()
        # ============================================
        # 9. Cross-validation: birth place vs kel/desa
        # ============================================
        # On a KTP the birth place frequently equals the village/kelurahan;
        # when the two are similar, prefer the validated kel/desa spelling.
        if result.get('tempat_lahir') and result.get('kel_desa'):
            tl = result['tempat_lahir'].upper()
            kd = result['kel_desa'].upper()
            # Similarity score between the two strings
            ratio = difflib.SequenceMatcher(None, tl, kd).ratio()
            if ratio > 0.7:
                # Close enough: adopt the validated kel/desa value
                print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mirip dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")
                result['tempat_lahir'] = kd
            elif ratio > 0.5:
                # Somewhat similar: only log for debugging
                print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mungkin sama dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")
        # If tempat_lahir is empty but kel_desa is set they might match,
        # but we do not auto-fill because they can legitimately differ.
        return result
if __name__ == "__main__":
    # Smoke test: run a cleanly formatted synthetic OCR result through
    # the extractor and dump every parsed field.
    sample_lines = [
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    ]
    sample_ocr = [{'text': line} for line in sample_lines]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    for key, value in result.items():
        print(f"{key}: {value}")