""" KTP Field Extractor Ekstraksi data terstruktur dari hasil OCR KTP Indonesia Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon) OPTIMIZED: Pre-compiled regex patterns for better performance """ import re from typing import Dict, Optional, List import difflib # Debug mode - set to False for production DEBUG_MODE = False class KTPExtractor: """Ekstrak field dari hasil OCR KTP""" # Pre-compiled regex patterns (optimization) COLON_PATTERN = re.compile(r'[::]') NIK_PATTERN = re.compile(r'\b(\d{16})\b') DATE_PATTERN = re.compile(r'(\d{2}[-/]\d{2}[-/]\d{4})') RT_RW_PATTERN = re.compile(r'(\d{3})\s*/\s*(\d{3})') GOL_DARAH_PATTERN = re.compile(r'([ABO]{1,2}[+\-]?)', re.IGNORECASE) PROVINSI_SPLIT_PATTERN = re.compile(r'(?i)provinsi\s*') KABUPATEN_SPLIT_PATTERN = re.compile(r'(?i)\s*(kabupaten|kota)\s*') TTL_PATTERN = re.compile(r'(?i)tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir') # Pattern colon string (for backward compatibility) COLON_PATTERN_STR = r'[::]' # Daftar Provinsi Indonesia (38 Provinsi) PROVINSI_LIST = [ "ACEH", "SUMATERA UTARA", "SUMATERA BARAT", "RIAU", "JAMBI", "SUMATERA SELATAN", "BENGKULU", "LAMPUNG", "KEPULAUAN BANGKA BELITUNG", "KEPULAUAN RIAU", "DKI JAKARTA", "JAWA BARAT", "JAWA TENGAH", "DI YOGYAKARTA", "JAWA TIMUR", "BANTEN", "BALI", "NUSA TENGGARA BARAT", "NUSA TENGGARA TIMUR", "KALIMANTAN BARAT", "KALIMANTAN TENGAH", "KALIMANTAN SELATAN", "KALIMANTAN TIMUR", "KALIMANTAN UTARA", "SULAWESI UTARA", "SULAWESI TENGAH", "SULAWESI SELATAN", "SULAWESI TENGGARA", "GORONTALO", "SULAWESI BARAT", "MALUKU", "MALUKU UTARA", "PAPUA BARAT", "PAPUA", "PAPUA SELATAN", "PAPUA TENGAH", "PAPUA PEGUNUNGAN", "PAPUA BARAT DAYA" ] # Keywords untuk jenis kelamin MALE_KEYWORDS = ['laki', 'pria', 'male'] FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female'] # Agama yang valid AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu'] # Pekerjaan umum PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 
'pns', 'wiraswasta', 'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga', 'tidak bekerja', 'lainnya', 'mengurus rumah tangga'] # Status Perkawinan yang valid STATUS_PERKAWINAN_LIST = ['BELUM KAWIN', 'KAWIN', 'CERAI HIDUP', 'CERAI MATI'] # Field Labels untuk fuzzy matching (mengatasi typo OCR seperti "Aamat" -> "ALAMAT") FIELD_LABELS = { 'nama': ['NAMA'], 'alamat': ['ALAMAT'], 'agama': ['AGAMA'], 'pekerjaan': ['PEKERJAAN'], 'kewarganegaraan': ['KEWARGANEGARAAN', 'WARGANEGARA'], 'tempat_lahir': ['TEMPAT', 'LAHIR', 'TEMPAT/TGL LAHIR'], 'jenis_kelamin': ['JENIS KELAMIN', 'JENIS', 'KELAMIN'], 'gol_darah': ['GOL. DARAH', 'GOL DARAH', 'GOLONGAN DARAH'], 'kel_desa': ['KEL/DESA', 'KELURAHAN', 'DESA'], 'kecamatan': ['KECAMATAN', 'KEC'], 'status_perkawinan': ['STATUS PERKAWINAN', 'PERKAWINAN'], 'berlaku_hingga': ['BERLAKU HINGGA', 'BERLAKU'], 'rt_rw': ['RT/RW', 'RT', 'RW'], } # ============================================ # Sistem Penamaan Hindu Bali # ============================================ # Struktur: [Prefix Gender] + [Gelar Kasta] + [Penanda Gender] + [Urutan Lahir] + [Nama Pribadi] # Prefix penanda gender (harus di awal nama) BALI_GENDER_PREFIX = { 'NI': 'PEREMPUAN', # Prefix untuk perempuan 'I': 'LAKI-LAKI', # Prefix untuk laki-laki } # Gelar Kasta (setelah prefix gender) BALI_KASTA = { 'IDA': 'BRAHMANA', 'GUSTI': 'KSATRIA', 'ANAK AGUNG': 'KSATRIA', 'COKORDA': 'KSATRIA', 'DEWA': 'KSATRIA', 'DESAK': 'KSATRIA', 'AGUNG': 'KSATRIA', 'NGAKAN': 'WAISYA', 'SANG': 'WAISYA', 'SI': 'WAISYA', } # Penanda gender tambahan (setelah kasta) BALI_GENDER_MARKER = { 'AYU': 'PEREMPUAN', 'ISTRI': 'PEREMPUAN', 'LUH': 'PEREMPUAN', 'BAGUS': 'LAKI-LAKI', 'GEDE': 'LAKI-LAKI', 'AGUS': 'LAKI-LAKI', 'ALIT': 'LAKI-LAKI', # Kecil/muda (untuk laki-laki) } # Urutan kelahiran (bersiklus setiap 4 anak) BALI_BIRTH_ORDER = { 'PUTU': 1, 'WAYAN': 1, 'GEDE': 1, 'ILUH': 1, 'MADE': 2, 'KADEK': 2, 'NENGAH': 2, 'NYOMAN': 3, 'KOMANG': 3, 'KETUT': 4, 'BALIK': 5, # Untuk anak ke-5+ (siklus ulang) } # 
Soroh/Klan Bali (identifikasi garis keturunan) BALI_SOROH = { 'PASEK': 'SOROH', # Klan mayoritas (~60% Hindu Bali) 'PANDE': 'SOROH', # Klan pandai besi/metalurgi 'ARYA': 'SOROH', # Klan Arya 'BENDESA': 'SOROH', # Pemimpin adat 'TANGKAS': 'SOROH', # Klan Tangkas 'CELAGI': 'SOROH', # Klan Celagi 'SENGGUHU': 'SOROH', # Klan Sengguhu 'KUBAYAN': 'SOROH', # Klan Kubayan 'BANDESA': 'SOROH', # Varian Bendesa } # Gabungkan semua komponen untuk deteksi (urut dari panjang ke pendek) BALI_NAME_COMPONENTS = [ # Prefix gender 'NI', 'I', # Kasta (prioritas: yang lebih panjang dulu) 'ANAK AGUNG', 'COKORDA', 'NGAKAN', 'IDA', 'GUSTI', 'DEWA', 'DESAK', 'AGUNG', 'SANG', 'SI', # Soroh/Klan 'PASEK', 'PANDE', 'ARYA', 'BENDESA', 'BANDESA', 'TANGKAS', 'CELAGI', 'SENGGUHU', 'KUBAYAN', # Gender marker 'AYU', 'ISTRI', 'LUH', 'BAGUS', 'GEDE', 'AGUS', 'ALIT', # Urutan lahir 'WAYAN', 'PUTU', 'ILUH', 'MADE', 'KADEK', 'NENGAH', 'NYOMAN', 'KOMANG', 'KETUT', 'BALIK', ] # KTP Zone Template (normalized coordinates: x_min, y_min, x_max, y_max) # Based on standard KTP layout ZONES = { 'header_provinsi': (0.15, 0.00, 0.85, 0.07), # PROVINSI header 'header_kabupaten': (0.15, 0.05, 0.85, 0.13), # KABUPATEN header 'nik': (0.02, 0.10, 0.70, 0.22), # NIK area 'nama': (0.02, 0.18, 0.70, 0.28), # Nama area 'ttl': (0.02, 0.25, 0.70, 0.36), # Tempat/Tgl Lahir 'jenis_kelamin': (0.02, 0.33, 0.45, 0.42), # Jenis Kelamin (left) 'gol_darah': (0.40, 0.33, 0.70, 0.42), # Gol Darah (right of jenis) 'alamat': (0.02, 0.38, 0.70, 0.50), # Alamat 'rt_rw': (0.02, 0.46, 0.70, 0.54), # RT/RW 'kel_desa': (0.02, 0.51, 0.70, 0.60), # Kel/Desa 'kecamatan': (0.02, 0.57, 0.70, 0.66), # Kecamatan 'agama': (0.02, 0.63, 0.70, 0.72), # Agama 'status': (0.02, 0.69, 0.70, 0.78), # Status Perkawinan 'pekerjaan': (0.02, 0.75, 0.70, 0.84), # Pekerjaan 'wni': (0.02, 0.81, 0.70, 0.90), # Kewarganegaraan 'berlaku': (0.02, 0.87, 0.70, 0.96), # Berlaku Hingga 'foto': (0.68, 0.10, 0.98, 0.55), # Foto (right side) 'penerbitan': (0.65, 0.58, 0.98, 
0.98), # Tempat & Tanggal penerbitan } def __init__(self): self.image_width = 0 self.image_height = 0 def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]: """Determine which zone a text belongs to based on normalized coordinates""" if img_width == 0 or img_height == 0: return None # Normalize coordinates x_norm = x_center / img_width y_norm = y_center / img_height for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items(): if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max: return zone_name return None def _extract_value_from_text(self, text: str) -> str: """Extract value part from label:value text""" # Split by colon (standard or full-width) parts = re.split(r'[::]', text, 1) if len(parts) > 1: return parts[1].strip() return text.strip() def _find_best_match(self, text: str, candidates: List[str], cutoff: float = 0.6) -> Optional[str]: """Find best fuzzy match from candidates""" matches = difflib.get_close_matches(text, candidates, n=1, cutoff=cutoff) return matches[0] if matches else None def _is_label_match(self, text: str, field_name: str, cutoff: float = 0.7) -> bool: """ Fuzzy match untuk label field - mengatasi typo OCR seperti "Aamat" -> "ALAMAT" Returns True jika text cocok dengan salah satu label untuk field tersebut """ if not text or not text.strip(): return False if field_name not in self.FIELD_LABELS: return field_name.lower() in text.lower() text_upper = text.upper().strip() # Explicit conflict prevention if field_name == 'agama' and 'ALAMAT' in text_upper: return False if field_name == 'alamat' and 'AGAMA' in text_upper: return False # Coba exact match dulu (lebih cepat) for label in self.FIELD_LABELS[field_name]: if label in text_upper: return True # Fuzzy match jika tidak ada exact match # Ekstrak kata pertama dari text (biasanya label ada di awal) parts = text_upper.split(':')[0].split() if not parts: return False first_word = parts[0] for label in self.FIELD_LABELS[field_name]: 
label_parts = label.split() if not label_parts: continue # Bandingkan dengan kata pertama ratio = difflib.SequenceMatcher(None, first_word, label_parts[0]).ratio() # Dynamic cutoff logic effective_cutoff = cutoff if len(first_word) < 7: # Use stricter cutoff for short words to prevent ALAMAT (6) matching AGAMA (5) -> ratio 0.73 effective_cutoff = max(cutoff, 0.82) if ratio >= effective_cutoff: if DEBUG_MODE: print(f" [FUZZY LABEL] '{first_word}' matched '{label}' (ratio={ratio:.2f})") return True return False def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]: """ Extract content after a label (fuzzy/regex match). Handles cases with/without colons. """ if not text: return None # 1. Try Regex Search if pattern provided if label_pattern: # Construct regex: Label + optional spaces/colon + (Group 1: Value) # flags=re.IGNORECASE should be used # We want to find the END of the label match = re.search(f"({label_pattern})[:\\s]*", text, re.IGNORECASE) if match: # Return everything after the match end return text[match.end():].strip() return None def _parse_balinese_name(self, name: str) -> str: """ Parse nama Bali yang digabung OCR dan tambahkan spasi yang tepat. Contoh: "NIGUSTIAYUNYOMANSUWETRI" -> "NI GUSTI AYU NYOMAN SUWETRI" Struktur nama Bali: [Prefix Gender] + [Gelar Kasta] + [Penanda Gender] + [Urutan Lahir] + [Nama Pribadi] PENTING: Hanya proses jika nama benar-benar mengandung komponen Bali! 
""" if not name: return name name_upper = name.upper().strip() # Jika sudah ada spasi dengan jumlah wajar, kembalikan apa adanya if name_upper.count(' ') >= 2: return name_upper # Cek apakah nama mengandung komponen Bali # Nama harus dimulai dengan NI, I GUSTI, IDA, atau komponen urutan lahir Bali name_clean = name_upper.replace(' ', '') is_balinese_name = False # Cek prefix khas Bali if name_clean.startswith('NI') and len(name_clean) > 3: # NI harus diikuti komponen Bali lain (GUSTI, LUH, WAYAN, dll) after_ni = name_clean[2:] for comp in ['GUSTI', 'LUH', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG', 'PASEK', 'PANDE']: if after_ni.startswith(comp): is_balinese_name = True break elif name_clean.startswith('IGUSTI') or name_clean.startswith('IDABAGUS') or name_clean.startswith('IDAAYU'): is_balinese_name = True elif any(name_clean.startswith(p) for p in ['GUSTI', 'WAYAN', 'PUTU', 'MADE', 'KADEK', 'NYOMAN', 'KOMANG', 'KETUT']): is_balinese_name = True if not is_balinese_name: # Bukan nama Bali, kembalikan dengan pemisahan spasi standar # Jika ada 1 spasi, kembalikan apa adanya if ' ' in name_upper: return name_upper # Jika tidak ada spasi sama sekali, kembalikan apa adanya (mungkin memang 1 kata) return name_upper # Urutan komponen yang akan dicari (dari yang terpanjang ke terpendek untuk akurasi) components_ordered = sorted(self.BALI_NAME_COMPONENTS, key=len, reverse=True) result_parts = [] remaining = name_clean # Parse prefix gender (NI atau I di awal) if remaining.startswith('NI'): result_parts.append('NI') remaining = remaining[2:] elif remaining.startswith('I') and len(remaining) > 1: # Pastikan bukan bagian dari kata lain next_char = remaining[1] if len(remaining) > 1 else '' # Cek apakah karakter setelah I adalah konsonan (bukan vokal) if next_char not in 'AIUEO': result_parts.append('I') remaining = remaining[1:] # Parse komponen-komponen lainnya found = True max_iterations = 10 # Prevent infinite loop iteration = 0 while remaining and found 
and iteration < max_iterations: found = False iteration += 1 for component in components_ordered: if remaining.startswith(component): # Skip jika komponen sudah ada di result (kecuali nama pribadi) if component not in result_parts or component not in self.BALI_NAME_COMPONENTS: result_parts.append(component) remaining = remaining[len(component):] found = True break # Sisa adalah nama pribadi if remaining: result_parts.append(remaining) parsed_name = ' '.join(result_parts) # Log jika ada perubahan if parsed_name != name_upper: print(f" [BALI NAME] '{name_upper}' -> '{parsed_name}'") return parsed_name def _search_best_match_in_text(self, text: str, candidates: List[str], prefix: str = "") -> tuple: """ Search if any candidate is present in text using multiple strategies: 1. Exact substring 2. Prefix + Candidate (Fuzzy) - e.g. "PROVINSI BALI" 3. Candidate Only (Fuzzy) - e.g. "BALI" (if prefix is missing/damaged) Returns (best_candidate, confidence_score) """ text_upper = text.upper() best_match = None best_ratio = 0.0 # Strategy 1: Exact substring match (fastest & most reliable) for candidate in candidates: if candidate in text_upper: if len(candidate) > len(best_match or ""): best_match = candidate best_ratio = 1.0 if best_ratio == 1.0: return best_match, best_ratio # Strategy 2: Prefix Construction & Fuzzy Match prefix_upper = prefix.upper() if prefix else "" # DEBUG: Print checking (controlled by DEBUG_MODE) if DEBUG_MODE: print(f"DEBUG Check Text: '{text_upper}' with Prefix: '{prefix_upper}'") for candidate in candidates: # 2a. Compare with Prefix + Space (e.g. "PROVINSI BALI") if prefix: target_spaced = f"{prefix_upper} {candidate}" s_spaced = difflib.SequenceMatcher(None, target_spaced, text_upper) ratio_spaced = s_spaced.ratio() # print(f" -> Compare '{target_spaced}' vs '{text_upper}' = {ratio_spaced:.2f}") if ratio_spaced > best_ratio and ratio_spaced > 0.5: best_ratio = ratio_spaced best_match = candidate # 2b. Compare with Prefix NO SPACE (e.g. 
"PROVINSIBALI") # This handles "PROVNSIBALI" perfectly target_merged = f"{prefix_upper}{candidate}" s_merged = difflib.SequenceMatcher(None, target_merged, text_upper) ratio_merged = s_merged.ratio() if DEBUG_MODE: print(f" -> Compare Merged '{target_merged}' vs '{text_upper}' = {ratio_merged:.2f}") if ratio_merged > best_ratio and ratio_merged > 0.5: best_ratio = ratio_merged best_match = candidate # 2c. Compare Candidate ONLY (e.g. "BALI") if len(candidate) > 3: s_raw = difflib.SequenceMatcher(None, candidate, text_upper) ratio_raw = s_raw.ratio() # print(f" -> Compare Raw '{candidate}' vs '{text_upper}' = {ratio_raw:.2f}") if ratio_raw > best_ratio and ratio_raw > 0.6: best_ratio = ratio_raw best_match = candidate if DEBUG_MODE: print(f"DEBUG Best Match: {best_match} ({best_ratio:.2f})") return best_match, best_ratio def _detect_image_size(self, ocr_results: List[Dict]) -> tuple: """Detect image dimensions from bounding boxes""" max_x, max_y = 0, 0 for r in ocr_results: bbox = r.get('bbox', []) if bbox and len(bbox) >= 4: for point in bbox: if len(point) >= 2: max_x = max(max_x, point[0]) max_y = max(max_y, point[1]) # Add some margin return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640) def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict): """Extract fields based on zone assignments""" # PROVINSI from header if 'header_provinsi' in zone_texts: print(f"DEBUG Zone Provinsi Content: {zone_texts['header_provinsi']}") for text in zone_texts['header_provinsi']: text_clean = text.strip() # Use prefix strategy: "PROVINSI " + result vs text match, score = self._search_best_match_in_text(text_clean, self.PROVINSI_LIST, prefix="PROVINSI") # LOWER THRESHOLD to 0.5 because "PROVINSI BALI" vs "PROVNSIBALI" is roughly 0.5-0.6 range if match and score > 0.5: result['provinsi'] = match # Remove the found province (and label) from text to see what's left # If we matched "PROVINSI JAWA TIMUR", the text might be 
"PROVNSIJAWATMRKABUPATENSUMENEP" # It's hard to cleanly remove "PROVISI JAWA TIMUR" if it was fuzzy matched. # BUT, we can try to find "KABUPATEN" or "KOTA" in the original text # independent of the province match if 'kabupaten' in text_clean.lower() or 'kota' in text_clean.lower(): parts = re.split(r'(?i)\s*(kabupaten|kota)', text_clean) if len(parts) > 1: kab_part = "".join(parts[1:]).strip() kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip() if kab_val and result['kabupaten_kota'] is None: prefix = "KABUPATEN" if "kabupaten" in text_clean.lower() else "KOTA" result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}" break # Fallback to keyword splitting (Legacy/Blurry fallback) text_lower = text.lower() val = text # If keyword exists, strip it if 'provinsi' in text_lower: split_prov = re.split(r'(?i)provinsi\s*', text, 1) if len(split_prov) > 1: val = split_prov[1].strip() else: val = "" # Check for merged text if 'kabupaten' in text_lower or 'kota' in text_lower: parts = re.split(r'(?i)\s*(kabupaten|kota)', val) val = parts[0].strip() if len(parts) > 1: kab_part = "".join(parts[1:]).strip() kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip() if kab_val and result['kabupaten_kota'] is None: prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA" result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}" if val and len(val) > 2: # Try fuzzy match again on the cleaned value best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6) if best_match: result['provinsi'] = best_match else: result['provinsi'] = val.upper() break # KABUPATEN/KOTA from header if 'header_kabupaten' in zone_texts: for text in zone_texts['header_kabupaten']: text_lower = text.lower() val = text # Check keyword if 'kabupaten' in text_lower or 'kota' in text_lower: split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1) if len(split_kab) > 1: val = split_kab[-1].strip() else: val = "" # If no keyword, but it's in the kabupaten zone, 
assume it's data if val: # Re-add prefix standard if we separated it or if it was missing # Heuristic: if validation suggests it's a known regency, we are good. # For now, standardize format. if result['kabupaten_kota'] is None: prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA" # If no keyword found, default to KABUPATEN? Or better check Wilayah? # Let's default to detected keyword or KABUPATEN if "kota" in text_lower: prefix = "KOTA" else: prefix = "KABUPATEN" result['kabupaten_kota'] = f"{prefix} {val.upper()}" break # NAMA from nama zone (skip label line) if 'nama' in zone_texts: for text in zone_texts['nama']: text_lower = text.lower() if 'nama' not in text_lower and len(text) > 2: result['nama'] = text.upper() break elif 'nama' in text_lower: val = self._extract_value_from_text(text) if val and 'nama' not in val.lower(): result['nama'] = val.upper() # TTL from ttl zone if 'ttl' in zone_texts: for text in zone_texts['ttl']: # Skip if text is JUST the label (length check or fuzzy match) if len(text) < 15 and self._is_label_match(text, 'tempat_lahir'): continue if 'tempat' in text.lower() or 'lahir' in text.lower() or 'tgl' in text.lower() or len(text) > 5: val = self._extract_value_from_text(text) if val: # Don't accept if val looks like label if self._is_label_match(val, 'tempat_lahir') and len(val) < 20: continue self._parse_ttl(val, result) # Only break if we actually got a birth date, otherwise keep looking if result['tanggal_lahir']: break # JENIS KELAMIN if 'jenis_kelamin' in zone_texts: for text in zone_texts['jenis_kelamin']: text_lower = text.lower() if 'laki' in text_lower: result['jenis_kelamin'] = 'LAKI-LAKI' break elif 'perempuan' in text_lower or 'wanita' in text_lower: result['jenis_kelamin'] = 'PEREMPUAN' break # GOL DARAH if 'gol_darah' in zone_texts: for text in zone_texts['gol_darah']: gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE) if gol_match: result['gol_darah'] = gol_match.group(1).upper() break # ALAMAT if 
'alamat' in zone_texts: for text in zone_texts['alamat']: if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1: val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text if val and 'alamat' not in val.lower(): result['alamat'] = val.upper() break # RT/RW if 'rt_rw' in zone_texts: for text in zone_texts['rt_rw']: rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text) if rt_rw_match: result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}" break # KEL/DESA if 'kel_desa' in zone_texts: for text in zone_texts['kel_desa']: if 'kel' in text.lower() or 'desa' in text.lower(): val = self._extract_value_from_text(text) if val and 'kel' not in val.lower(): result['kel_desa'] = val.upper() break elif result['kel_desa'] is None: # Fallback context: simple text result['kel_desa'] = text.upper() # KECAMATAN if 'kecamatan' in zone_texts: for text in zone_texts['kecamatan']: if 'kec' in text.lower(): val = self._extract_value_from_text(text) if val and 'kec' not in val.lower(): result['kecamatan'] = val.upper() break elif result['kecamatan'] is None: result['kecamatan'] = text.upper() # AGAMA if 'agama' in zone_texts: for text in zone_texts['agama']: val = text.upper() if 'agama' in text.lower(): val = self._extract_value_from_text(text).upper() # Verify against valid list for agama in self.AGAMA_LIST: if agama.upper() in val: result['agama'] = agama.upper() break if result['agama']: break # STATUS PERKAWINAN if 'status' in zone_texts: for text in zone_texts['status']: val = text.upper() # Normalize common OCR errors (e.g. 
BELUMKAWIN) val = val.replace("BELUMKAWIN", "BELUM KAWIN") # Check against official list found_status = False for status in self.STATUS_PERKAWINAN_LIST: if status in val: result['status_perkawinan'] = status found_status = True break if found_status: break # PEKERJAAN if 'pekerjaan' in zone_texts: best_job = None potential_job = None for text in zone_texts['pekerjaan']: val = text.upper() if 'pekerjaan' in text.lower(): val = self._extract_value_from_text(text).upper() # Clean up val = val.strip() if not val or len(val) < 3 or 'PEKERJAAN' in val: continue # 1. Check against wildcard/list (Priority) # Buruh, Karyawan, Pelajar, dll if any(job.upper() in val for job in self.PEKERJAAN_LIST): best_job = val break # Found a definitive job # 2. Save as potential if it's NOT a known bad value (like City names) # Avoid capturing 'TABANAN', 'JAKARTA', date strings if not any(city in val for city in ['KABUPATEN', 'KOTA', 'TABANAN', 'BADUNG', 'DENPASAR', 'JAKARTA', 'BANDUNG']): if not re.search(r'\d{2}-\d{2}-\d{4}', val): # Avoid dates if potential_job is None: potential_job = val if best_job: result['pekerjaan'] = best_job elif potential_job: result['pekerjaan'] = potential_job # WNI if 'wni' in zone_texts: for text in zone_texts['wni']: if 'wni' in text.lower(): result['kewarganegaraan'] = 'WNI' break elif 'wna' in text.lower(): result['kewarganegaraan'] = 'WNA' break # PENERBITAN area (tempat & tanggal dalam satu zona) if 'penerbitan' in zone_texts: for text in zone_texts['penerbitan']: # Look for date date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text) if date_match and result['tanggal_penerbitan'] is None: result['tanggal_penerbitan'] = date_match.group(1) def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]: """ Ekstrak field KTP dari hasil OCR dengan template-based zone detection Args: ocr_results: List hasil dari OCREngine.extract_text() Returns: Dict dengan field KTP """ result = { 'nik': None, 'nama': None, 'tempat_lahir': None, 
'tanggal_lahir': None, 'jenis_kelamin': None, 'gol_darah': None, 'alamat': None, 'rt_rw': None, 'kel_desa': None, 'kecamatan': None, 'agama': None, 'status_perkawinan': None, 'pekerjaan': None, 'kewarganegaraan': None, 'berlaku_hingga': 'SEUMUR HIDUP', # Default sesuai peraturan pemerintah e-KTP 'provinsi': None, 'kabupaten_kota': None, 'tanggal_penerbitan': None, } # Detect image dimensions from bounding boxes img_width, img_height = self._detect_image_size(ocr_results) # Assign zones to each OCR result zone_texts = {} # zone_name -> list of texts for r in ocr_results: x_center = r.get('x_center', 0) y_center = r.get('y_center', 0) zone = self._get_zone(x_center, y_center, img_width, img_height) if zone: if zone not in zone_texts: zone_texts[zone] = [] zone_texts[zone].append(r['text']) # Debug: print zone assignments print("\n[DEBUG KTPExtractor] Zone assignments:") for zone, texts in zone_texts.items(): print(f" {zone}: {texts}") # Extract fields using zone-based approach self._extract_by_zones(zone_texts, result) # Gabungkan semua teks untuk fallback pattern matching texts = [r['text'].strip() for r in ocr_results] all_text = '\n'.join(texts) # Ekstrak NIK (16 digit) - bisa ada di mana saja nik_match = re.search(r'\b(\d{16})\b', all_text) if nik_match: result['nik'] = nik_match.group(1) print(f" -> NIK found: {result['nik']}") # Fallback: Parse line by line for fields not found by zone for i, text in enumerate(texts): # Skip baris yang hanya berisi punctuation atau kosong text_stripped = text.strip() if not text_stripped or text_stripped in [':', ':', '.', '-', '/', '|']: continue # Skip baris yang terlalu pendek (hanya 1-2 karakter non-alfanumerik) if len(text_stripped) <= 2 and not any(c.isalnum() for c in text_stripped): continue text_lower = text.lower() # Normalize colons text_normalized = re.sub(self.COLON_PATTERN, ':', text) text_norm_lower = text_normalized.lower() # ===== PROVINSI ===== if 'provinsi' in text_lower and result['provinsi'] is None: # 
Split by PROVINSI and take remainder split_prov = re.split(r'(?i)provinsi\s*', text, 1) if len(split_prov) > 1: val = split_prov[1].strip() # Check if it contains kabupaten/kota (merged line case) if 'kabupaten' in val.lower() or 'kota' in val.lower(): parts = re.split(r'(?i)\s*(kabupaten|kota)', val) val = parts[0].strip() if val: # Fuzzy match against valid provinces best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6) if best_match: result['provinsi'] = best_match else: result['provinsi'] = val.upper() # Check for next line if current line only had 'PROVINSI' if result['provinsi'] is None and i + 1 < len(texts): next_text = texts[i+1].strip() next_lower = next_text.lower() # Only take next line if it doesn't look like another field if not any(kw in next_lower for kw in ['provinsi', 'kabupaten', 'kota', 'nik']): # Fuzzy match next line val = next_text.upper() best_match = self._find_best_match(val, self.PROVINSI_LIST, cutoff=0.6) if best_match: result['provinsi'] = best_match else: result['provinsi'] = val # ===== KABUPATEN/KOTA ===== if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None: if 'provinsi' not in text_lower: # Bukan bagian dari provinsi # Split by KABUPATEN or KOTA and take remainder split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1) if len(split_kab) > 1: prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA" val = split_kab[-1].strip() if val: result['kabupaten_kota'] = f"{prefix} {val.upper()}" else: result['kabupaten_kota'] = text.strip().upper() else: result['kabupaten_kota'] = text.strip().upper() # ===== NAMA ===== if result['nama'] is None and self._is_label_match(text, 'nama'): val = self._extract_after_label(text_normalized, 'nama') current_name = "" if val: current_name = val.upper() # Loop check baris berikutnya for Name (handle 2-3 lines) offset = 1 # Batasi maksimal 2 baris tambahan untuk Nama (total 3 baris) while i + offset < 
len(texts) and offset <= 2: next_text = texts[i+offset].strip() next_lower = next_text.lower() is_stop = False # 1. Check Stop Keywords (Field Labels below Name) # Stop if next line is Tempat Lahir, Jenis Kelamin, Alamat, etc. stop_keywords = ['tempat', 'lahir', 'tgl', 'jenis', 'kelamin', 'alamat', 'rt/rw', 'nik'] if any(kw in next_lower for kw in stop_keywords): is_stop = True print(f" [NAMA STOP] Matched stop keyword in '{next_text}'") # 2. Check Case Sensitivity (Heuristic) if not is_stop: letters = [c for c in next_text if c.isalpha()] if letters: upper_count = sum(1 for c in letters if c.isupper()) upper_ratio = upper_count / len(letters) # If mostly lowercase/title case, likely a label (e.g. "Tempat Lahir") if upper_ratio < 0.4 and len(letters) > 3: is_stop = True print(f" [NAMA STOP] Likely Label based on Case (Ratio={upper_ratio:.2f})") if not is_stop: if len(next_text) > 2: print(f" [NAMA MERGE] Merging '{next_text}'") if current_name: current_name += " " + next_text.upper() else: current_name = next_text.upper() offset += 1 else: print(f" [NAMA SKIP] Too short '{next_text}'") # Kalau terlalu pendek (noise), boleh skip atau stop? # Biasanya nama tidak putus jadi 1 huruf. Anggap stop utk aman, atau skip. # Kita skip saja increment offset. offset += 1 else: break if current_name: # Fix Spacing Issues (e.g. BAGUSGEDE -> BAGUS GEDE) current_name = re.sub(r'(BAGUS)(GEDE)', r'\1 \2', current_name) current_name = re.sub(r'(ANAK)(AGUNG)', r'\1 \2', current_name) # Common issue result['nama'] = current_name # ===== TEMPAT/TANGGAL LAHIR ===== # ... (starts around line 830 in original) ... # (Skipping down to ALAMAT section for the replacement block) # ... regex find ... 
# ===== ALAMAT ===== (dengan fuzzy label matching) if result['alamat'] is None and self._is_label_match(text, 'alamat'): val = self._extract_after_label(text_normalized, r'a{1,2}l{0,2}a?m{0,2}a?t') # Logic multi-line current_addr = "" if val: current_addr = val.upper() # Loop check baris berikutnya (bisa ambil i+1, i+2, dst selama bukan label) offset = 1 while i + offset < len(texts): next_text = texts[i+offset].strip() print(f" [ALAMAT CHECK] Offset +{offset}: '{next_text}'") next_lower = next_text.lower() is_stop = False # 1. Cek Pola RT/RW (angka/angka) -> Pasti STOP if re.search(r'\d{3}\s*/\s*\d{3}', next_text) or re.match(r'^[.\-]+\s*/\s*[.\-]+$', next_text): is_stop = True print(" [ALAMAT STOP] Matched RT/RW pattern") # 2. Cek Keywords Label Pembatas elif any(next_lower.startswith(prefix) for prefix in ['rt/', 'rw', 'rt/rw', 'kel', 'desa', 'kec', 'agama', 'status', 'kawin']): is_stop = True print(" [ALAMAT STOP] Matched label prefix") # 3. Cek Keywords Spesifik Full Word elif any(kw in next_lower for kw in ['kelurahan', 'kecamatan', 'perkawinan', 'kewarganegaraan']): is_stop = True print(" [ALAMAT STOP] Matched distinct label word") # 4. 
Check Case Sensitivity if not is_stop: letters = [c for c in next_text if c.isalpha()] if letters: upper_count = sum(1 for c in letters if c.isupper()) upper_ratio = upper_count / len(letters) # Jika hampir semua huruf kecil/Title Case (ratio < 0.4), dicurigai sebagai Label # Kecuali kata-kata pendek (< 5 chars) if upper_ratio < 0.4 and len(letters) > 4: is_stop = True print(f" [ALAMAT STOP] Detected Title Case/Lowercase (Ratio={upper_ratio:.2f}) -> Likely Label") # Jika BUKAN pembatas, AMBIL sebagai lanjutan alamat if not is_stop: if len(next_text) > 1: print(f" [ALAMAT MERGE] Merging '{next_text}'") if current_addr: current_addr += " " + next_text.upper() else: current_addr = next_text.upper() offset += 1 # Lanjut cek baris berikutnya else: print(f" [ALAMAT SKIP] Line too short '{next_text}'") offset += 1 # Skip noise, try next line? Or stop? usually skip noise is safer to continue else: print(f" [ALAMAT STOP] Hit Stop Condition '{next_text}'") break # Stop loop if current_addr: result['alamat'] = current_addr if current_addr: result['alamat'] = current_addr # ===== RT/RW ===== # Relaxed pattern to handle -/- or 000/000 if result['rt_rw'] is None: rt_rw_match = re.search(r'(\d{1,3}|-)\s*/\s*(\d{1,3}|-)', text) if rt_rw_match: result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}" print(f" [RT/RW] Found {result['rt_rw']}") # ===== KELURAHAN/DESA ===== if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower: if result['kel_desa'] is None: val = self._extract_after_label(text_normalized, 'kel|desa') if val: result['kel_desa'] = val.upper() elif i + 1 < len(texts): result['kel_desa'] = texts[i+1].strip().upper() # ===== TEMPAT/TANGGAL LAHIR ===== # Gunakan _is_label_match untuk fleksibilitas (e.g. 
Tempat/Tgl Lahir, Tmpt Lahir) if result['tempat_lahir'] is None and self._is_label_match(text, 'ttl'): print(f" [TTL DEBUG] Matched Label on line {i}: '{text}'") # Regex pattern yang SANGAT fleksibel untuk label TTL # Menangani berbagai variasi: Tmpat/Tgl Lahir, Tempat. Tgl. Lahir, dll # Intinya: T...mp...t L...hir val = self._extract_after_label(text_normalized, r't[ea]m?p?a?t.*?l[a@]hi?r?|tgl.*?l[a@]hi?r?') # Jika val kosong, coba ambil dari baris berikutnya if not val and i + 1 < len(texts): next_text = texts[i+1].strip() next_lower = next_text.lower() stop_keywords = ['jenis', 'kelamin', 'alamat', 'gol', 'darah'] if not any(kw in next_lower for kw in stop_keywords): val = next_text.upper() print(f" [TTL DEBUG] Took next line: '{val}'") if val: print(f" [TTL DEBUG] Parsing value: '{val}'") self._parse_ttl(val, result) if result['tanggal_lahir']: print(f" [TTL DEBUG] Success: {result['tanggal_lahir']}") # ===== JENIS KELAMIN ===== if result['jenis_kelamin'] is None: # 1. Coba cari Label dulu if self._is_label_match(text, 'jenis_kelamin'): val = self._extract_after_label(text_normalized, r'j[ea]ni?s\s*k[ea]l[a@]?mi?n') if val: if 'LAKI' in val.upper(): result['jenis_kelamin'] = 'LAKI-LAKI' elif 'PEREMPUAN' in val.upper() or 'WANITA' in val.upper(): result['jenis_kelamin'] = 'PEREMPUAN' if result['jenis_kelamin'] is None and i + 1 < len(texts): next_text = texts[i+1].upper() if 'LAKI' in next_text: result['jenis_kelamin'] = 'LAKI-LAKI' elif 'PEREMPUAN' in next_text or 'WANITA' in next_text: result['jenis_kelamin'] = 'PEREMPUAN' # 2. 
Fallback: Cari langsung keyword VALUES if result['jenis_kelamin'] is None: text_upper = text.upper() if 'LAKI-LAKI' in text_upper or 'LAKI - LAKI' in text_upper: result['jenis_kelamin'] = 'LAKI-LAKI' elif 'PEREMPUAN' in text_upper: result['jenis_kelamin'] = 'PEREMPUAN' # ===== GOLONGAN DARAH ===== if result['gol_darah'] is None: # Cek label if self._is_label_match(text, 'gol_darah'): val = self._extract_after_label(text_normalized, r'g?o?l\.?\s*d?a?r?a?h') # Jika label ketemu tapi val kosong, mungkin nempel (Gol.Darah : O) # atau ada di baris ini if val: gd_match = re.search(r'([ABO]{1,2}[+\-]?)', val) if gd_match: result['gol_darah'] = gd_match.group(1).upper() else: # Coba cari pattern gol darah di baris yang sama dengan label gd_match = re.search(r'([ABO]{1,2}[+\-]?)', text.upper().replace('0','O')) if gd_match: result['gol_darah'] = gd_match.group(1).upper() # Cek next line jika baris ini cuma label "Gol Darah" if result['gol_darah'] is None and self._is_label_match(text, 'gol_darah') and i+1 < len(texts): next_text = texts[i+1].strip().upper() if len(next_text) < 5: # Pendek, asumsi gol darah gd_match = re.search(r'([ABO]{1,2}[+\-]?)', next_text) if gd_match: result['gol_darah'] = gd_match.group(1).upper() # ===== KECAMATAN ===== if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower): if result['kecamatan'] is None: val = self._extract_after_label(text_normalized, 'kecamatan|kec') if val: result['kecamatan'] = val.upper() elif i + 1 < len(texts): # Value on next line (real KTP pattern) next_text = texts[i+1].strip() if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']): result['kecamatan'] = next_text.upper() # ===== AGAMA ===== (dengan fuzzy label matching) if self._is_label_match(text, 'agama'): val = self._extract_after_label(text_normalized, r'a?g{0,2}a?m{0,2}a') if val and result['agama'] is None: result['agama'] = val.upper() elif result['agama'] is None and i + 1 < 
len(texts): # Value on next line (real KTP pattern) next_text = texts[i+1].strip().upper() if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']: result['agama'] = next_text else: # Check if line contains only agama name for agama in self.AGAMA_LIST: if agama in text_lower and len(text) < 20: if result['agama'] is None: result['agama'] = text.strip().upper() break # ===== STATUS PERKAWINAN ===== if 'kawin' in text_lower: if result['status_perkawinan'] is None: # Check against official list first text_upper = text.upper().replace("BELUMKAWIN", "BELUM KAWIN") for status in self.STATUS_PERKAWINAN_LIST: if status in text_upper: result['status_perkawinan'] = status break # Fallback to extraction if not found in list if result['status_perkawinan'] is None: val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan') if val: result['status_perkawinan'] = val.upper() # ===== PEKERJAAN ===== if 'pekerjaan' in text_lower: val = self._extract_after_label(text_normalized, 'pekerjaan') if val and result['pekerjaan'] is None: result['pekerjaan'] = val.upper() elif result['pekerjaan'] is None and i + 1 < len(texts): # Value on next line (real KTP pattern) next_text = texts[i+1].strip() if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower(): result['pekerjaan'] = next_text.upper() else: # Check if line contains pekerjaan keyword for pekerjaan in self.PEKERJAAN_LIST: if pekerjaan in text_lower and len(text) < 30: if result['pekerjaan'] is None: result['pekerjaan'] = text.strip().upper() break # ===== KEWARGANEGARAAN ===== if 'wni' in text_lower: result['kewarganegaraan'] = 'WNI' elif 'wna' in text_lower: result['kewarganegaraan'] = 'WNA' elif 'warga' in text_lower and result['kewarganegaraan'] is None: val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga') if val: result['kewarganegaraan'] = val.upper() # ===== BERLAKU HINGGA ===== if 'berlaku' in text_lower or 'seumur' in text_lower: if 
result['berlaku_hingga'] is None: if 'seumur' in text_lower or 'hidup' in text_lower: result['berlaku_hingga'] = 'SEUMUR HIDUP' else: val = self._extract_after_label(text_normalized, 'berlaku') if val: result['berlaku_hingga'] = val.upper() # ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) ===== # Look for date that is NOT tanggal lahir (different date) if result['tanggal_penerbitan'] is None: # 1. Skip if contains Keywords of other date fields # Jangan ambil jika ada kata 'LAHIR', 'TGL', 'BERLAKU', 'SEUMUR', 'HINGGA' line_clean = text.lower() if any(kw in line_clean for kw in ['lahir', 'lahlr', 'tgl', 'tempat', 'berlaku', 'seumur', 'hingga', 'hidup']): pass # Skip else: # Match date format at end of text or standalone date date_match = re.search(r'(\d{2}[-\s/]\d{2}[-\s/]\d{4})$', text.strip()) if date_match: found_date = date_match.group(1).replace(' ', '-') # Make sure it's not the same as tanggal_lahir if result['tanggal_lahir'] != found_date: # Strict Position Check: MUST be in the bottom 30% of lines # (Untuk menghindari salah ambil tanggal lahir yg mungkin gagal diparsing sbg TTL) if i > len(texts) * 0.7: result['tanggal_penerbitan'] = found_date print(f" [TGL TERBIT] Found '{found_date}' at index {i}/{len(texts)}") else: print(f" [TGL TERBIT SKIP] Date '{found_date}' is too high ({i}/{len(texts)})") # ============================================ # AGGRESSIVE SCAN: Cari agama dari semua teks OCR # ============================================ # Indonesia hanya punya 6 agama resmi, mudah dideteksi if result['agama'] is None: # Daftar agama dengan variasi penulisan agama_patterns = { 'ISLAM': ['ISLAM', 'ISLM', 'ISIAM', 'ISLAMI'], 'KRISTEN': ['KRISTEN', 'KRISTEN PROTESTAN', 'PROTESTAN', 'KRISTN'], 'KATOLIK': ['KATOLIK', 'KATHOLIK', 'KATHOLK', 'KATOLIK ROMA', 'KATOLIK.'], 'HINDU': ['HINDU', 'HNDU', 'HINDU DHARMA', 'HINDHU'], 'BUDDHA': ['BUDDHA', 'BUDHA', 'BUDDA', 'BUDDHIS'], 'KONGHUCU': ['KONGHUCU', 'KHONGHUCU', 'KONGHUCHU', 'CONFUCIUS'], } for text 
in texts: text_upper = text.upper().strip() # Skip jika teks terlalu pendek atau terlalu panjang if len(text_upper) < 4 or len(text_upper) > 30: continue for agama_std, variants in agama_patterns.items(): for variant in variants: if variant in text_upper: result['agama'] = agama_std print(f" [AGAMA SCAN] Found '{variant}' in '{text_upper}' -> {agama_std}") break if result['agama']: break if result['agama']: break # ============================================ # AGGRESSIVE SCAN: Cari golongan darah dari semua teks OCR # ============================================ # Golongan darah hanya 4: A, B, AB, O (dengan/tanpa rhesus +/-) if result['gol_darah'] is None: gol_darah_patterns = ['AB+', 'AB-', 'A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB', 'A', 'B', 'O'] for text in texts: text_upper = text.upper().strip() # Hapus punctuation umum text_clean = re.sub(r'[:\.\,\s]+', '', text_upper) # Konversi 0 (nol) menjadi O (huruf) - OCR sering salah baca text_clean = text_clean.replace('0', 'O') # Skip jika teks terlalu panjang (bukan gol darah) if len(text_clean) > 10: continue # Cari match untuk gol darah (dari panjang ke pendek untuk prioritas AB sebelum A/B) for gol in gol_darah_patterns: # Exact match setelah dibersihkan if text_clean == gol: result['gol_darah'] = gol print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}") break # Match dengan prefix GOL if text_clean == f"GOL{gol}" or text_clean == f"GOLDARAH{gol}": result['gol_darah'] = gol print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}") break # Match sebagai single character di akhir teks pendek if len(text_clean) <= 3 and text_clean.endswith(gol): result['gol_darah'] = gol print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}") break if result['gol_darah']: break # ============================================ # AGGRESSIVE SCAN: Cari berlaku hingga dari semua teks OCR # ============================================ if result['berlaku_hingga'] is None: for text in texts: text_upper = text.upper().strip() if 
'SEUMUR' in text_upper or 'HIDUP' in text_upper: result['berlaku_hingga'] = 'SEUMUR HIDUP' print(f" [BERLAKU SCAN] Found '{text_upper}' -> SEUMUR HIDUP") break # Post-processing result = self._post_process(result) return result def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]: """Ekstrak nilai setelah label (supports various separators)""" patterns = [ rf'(?:{label_pattern})\s*:\s*(.+)', # label: value rf'(?:{label_pattern})\s+([A-Z0-9].+)', # label VALUE (uppercase start) ] for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE) if match: value = match.group(1).strip() # Remove trailing colon or label fragment value = re.sub(r'^[:\s]+', '', value) value = re.sub(r'\s*:\s*$', '', value) if value and len(value) > 1: return value return None def _parse_ttl(self, ttl_text: str, result: Dict): """Parse tempat/tanggal lahir dari text""" ttl_text = ttl_text.strip() # Normalize dates where OCR missed dashes: # "05 08 1978" -> "05-08-1978" # "05 08-1978" -> "05-08-1978" # "05-08 1978" -> "05-08-1978" ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text) ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text) ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text) # Handle "0508-1978" -> "05-08-1978" (Missing separator between day/month) ttl_text = re.sub(r'(\d{2})(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text) # Handle 8-digit date without separator: "05081978" -> "05-08-1978" date_8digit = re.search(r'(\d{8})', ttl_text) if date_8digit: d = date_8digit.group(1) formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}" ttl_text = ttl_text.replace(d, formatted) # Handle merged city+date like "JAKARTA05-08-1978" - add space before digits ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE) # Handle merged city+date like "JAKARTA.05-08-1978" -> replace dot with space ttl_text = re.sub(r'([A-Z])\.(\d)', r'\1 \2', ttl_text, flags=re.IGNORECASE) # Format: "TEMPAT, 
DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY" date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text) if date_match: result['tanggal_lahir'] = date_match.group(1) # Tempat adalah bagian sebelum tanggal place = ttl_text[:date_match.start()].strip(' ,:-/.') # Clean up label remnants place = re.sub(r'^(tempat|tgl|lahir|:|:)[/\s::]*', '', place, flags=re.IGNORECASE).strip() if place and len(place) > 2: result['tempat_lahir'] = place.upper() else: # Coba split by comma parts = ttl_text.split(',') if len(parts) >= 2: result['tempat_lahir'] = parts[0].strip().upper() result['tanggal_lahir'] = parts[1].strip() elif len(parts) == 1 and len(ttl_text) > 2: result['tempat_lahir'] = ttl_text.upper() def _post_process(self, result: Dict) -> Dict: """Post-processing hasil ekstraksi""" # Validasi NIK (harus 16 digit) if result['nik'] and not re.match(r'^\d{16}$', result['nik']): cleaned = re.sub(r'\D', '', result['nik']) if len(cleaned) == 16: result['nik'] = cleaned else: result['nik'] = None # Fix format tanggal lahir yang salah # Pattern: DDMM-YYYY (contoh: 1608-1976) -> DD-MM-YYYY (16-08-1976) if result['tanggal_lahir']: tl = result['tanggal_lahir'] # Match DDMM-YYYY format (salah) wrong_format = re.match(r'^(\d{2})(\d{2})-(\d{4})$', tl) if wrong_format: result['tanggal_lahir'] = f"{wrong_format.group(1)}-{wrong_format.group(2)}-{wrong_format.group(3)}" print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'") # Match DDMMYYYY format (tanpa separator) no_sep_format = re.match(r'^(\d{2})(\d{2})(\d{4})$', tl) if no_sep_format: result['tanggal_lahir'] = f"{no_sep_format.group(1)}-{no_sep_format.group(2)}-{no_sep_format.group(3)}" print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'") # Clean all string values - remove leading colons and extra whitespace for field in result: if result[field] and isinstance(result[field], str): val = result[field] # Remove leading colons (standard and full-width) val = re.sub(r'^[\s::]+', '', val) # Remove trailing colons val = 
re.sub(r'[\s::]+$', '', val) # Remove double spaces val = re.sub(r'\s+', ' ', val) result[field] = val.strip() # Bersihkan label dari values for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']: if result[field]: # Remove common labels yang ter-capture result[field] = re.sub( r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s::]*', '', result[field], flags=re.IGNORECASE ).strip() # Fix status perkawinan yang masih mengandung label if result['status_perkawinan']: sp = result['status_perkawinan'] sp = re.sub(r'^(STATUS|PERKAWINAN)[\s::]*', '', sp, flags=re.IGNORECASE).strip() result['status_perkawinan'] = sp # Fix berlaku hingga if result['berlaku_hingga']: bh = result['berlaku_hingga'] bh = re.sub(r'^(BERLAKU|HINGGA)[\s::]*', '', bh, flags=re.IGNORECASE).strip() if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper(): result['berlaku_hingga'] = 'SEUMUR HIDUP' else: result['berlaku_hingga'] = bh else: # Fallback: Sesuai peraturan pemerintah, e-KTP berlaku seumur hidup # Berlaku untuk e-KTP yang diterbitkan sejak 2011 result['berlaku_hingga'] = 'SEUMUR HIDUP' print(" [FALLBACK] berlaku_hingga = SEUMUR HIDUP (peraturan pemerintah)") # ============================================ # Parse nama Bali jika terdeteksi # ============================================ # Deteksi apakah ini KTP Bali berdasarkan: # 1. Provinsi = BALI # 2. NIK dimulai dengan 51 (kode Bali) # 3. 
Nama mengandung komponen khas Bali (NI, I GUSTI, dll) is_bali = False if result.get('provinsi') and 'BALI' in result['provinsi'].upper(): is_bali = True elif result.get('nik') and result['nik'].startswith('51'): is_bali = True elif result.get('nama'): nama_upper = result['nama'].upper() # Cek apakah nama dimulai dengan prefix Bali if nama_upper.startswith('NI') or nama_upper.startswith('IGUSTI') or \ nama_upper.startswith('IDABAGUS') or nama_upper.startswith('IDAAYU') or \ any(nama_upper.startswith(p) for p in ['GUSTI', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG']): is_bali = True if is_bali and result.get('nama'): result['nama'] = self._parse_balinese_name(result['nama']) # ============================================ # Validasi dan koreksi Agama # ============================================ if result.get('agama'): agama = result['agama'].upper().strip() # Fuzzy match terhadap daftar agama valid agama_match = None best_ratio = 0 for valid_agama in self.AGAMA_LIST: ratio = difflib.SequenceMatcher(None, agama, valid_agama.upper()).ratio() if ratio > best_ratio and ratio > 0.6: best_ratio = ratio agama_match = valid_agama.upper() if agama_match: if agama_match != agama: print(f" [AGAMA VALIDATE] '{agama}' -> '{agama_match}' (ratio={best_ratio:.2f})") result['agama'] = agama_match # Tidak ada fallback otomatis untuk agama - harus dari OCR # Fix merged kabupaten/kota names (e.g., JAKARTASELATAN -> JAKARTA SELATAN) if result['kabupaten_kota']: kk = result['kabupaten_kota'] # Add space before directional words kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)', r'\1 \2', kk, flags=re.IGNORECASE) # Common merged patterns kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE) result['kabupaten_kota'] = kk.upper() # Fix merged provinsi names if result['provinsi']: prov = result['provinsi'] prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, 
flags=re.IGNORECASE) prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)', r'\1 \2', prov, flags=re.IGNORECASE) result['provinsi'] = prov.upper() # Fix merged alamat/address (e.g., JLKECAPIV -> JL KECAPI V) if result['alamat']: alamat = result['alamat'] # Add space after common street prefixes alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE) # Add space before Roman numerals at the end (I, II, III, IV, V, VI, VII, VIII, IX, X) alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE) # Add space before single digits/numbers at end alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE) # Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A" alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE) result['alamat'] = alamat.upper() # ============================================ # Cross-validation: Tempat Lahir vs Kel/Desa # ============================================ # Pada KTP, tempat lahir sering sama dengan desa/kelurahan # Jika tempat_lahir mirip dengan kel_desa, gunakan yang tervalidasi if result.get('tempat_lahir') and result.get('kel_desa'): tl = result['tempat_lahir'].upper() kd = result['kel_desa'].upper() # Hitung similarity ratio = difflib.SequenceMatcher(None, tl, kd).ratio() if ratio > 0.7: # Tempat lahir mirip dengan kel/desa, gunakan kel/desa yang sudah divalidasi print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mirip dengan Kel/Desa '{kd}' (ratio={ratio:.2f})") result['tempat_lahir'] = kd elif ratio > 0.5: # Cukup mirip, log untuk debugging print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mungkin sama dengan Kel/Desa '{kd}' (ratio={ratio:.2f})") # Jika tempat_lahir kosong tapi kel_desa ada, mungkin sama # (tidak otomatis mengisi karena bisa beda) return result if __name__ == "__main__": # Test sample_ocr = [ {'text': 'PROVINSI JAWA BARAT'}, {'text': 'KABUPATEN BANDUNG'}, {'text': 'NIK : 
if __name__ == "__main__":
    # Manual smoke test: feed one synthetic KTP (one OCR line per entry,
    # already in reading order) through the extractor and print every field.
    sample_lines = (
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    )
    # The extractor is fed a list of dicts, each carrying a 'text' entry.
    sample_ocr = [{'text': line} for line in sample_lines]

    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)

    for key in result:
        print(f"{key}: {result[key]}")