# Source file: local-ocr/ktp_extractor.py (1509 lines, 71 KiB, Python).
# NOTE(review): the original hosting page warned about "ambiguous Unicode
# characters" — full-width colons (U+FF1A) used in the OCR patterns may have
# been altered or stripped in this copy; verify against the original file.
"""
KTP Field Extractor
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
OPTIMIZED: Pre-compiled regex patterns for better performance
"""
import re
from typing import Dict, Optional, List
import difflib
# Debug mode - set to False for production
DEBUG_MODE = False
class KTPExtractor:
    """Extract structured fields from Indonesian ID-card (KTP) OCR output."""

    # --- Pre-compiled regex patterns (compiled once for performance) ---
    # Matches both the ASCII colon and the full-width colon (U+FF1A) that
    # CJK-trained OCR engines frequently emit; the module docstring promises
    # full-width-colon support, so both forms must be normalized.
    COLON_PATTERN = re.compile(r'[:\uff1a]')
    # NIK is always exactly 16 digits.
    NIK_PATTERN = re.compile(r'\b(\d{16})\b')
    # Dates like 17-08-1945 or 17/08/1945.
    DATE_PATTERN = re.compile(r'(\d{2}[-/]\d{2}[-/]\d{4})')
    # "003/004"-style RT/RW pairs (three digits each side).
    RT_RW_PATTERN = re.compile(r'(\d{3})\s*/\s*(\d{3})')
    # Blood type. NOTE(review): `[ABO]{1,2}` also admits invalid pairs such
    # as "BO"/"BA" — validate downstream if strictness matters.
    GOL_DARAH_PATTERN = re.compile(r'([ABO]{1,2}[+\-]?)', re.IGNORECASE)
    PROVINSI_SPLIT_PATTERN = re.compile(r'(?i)provinsi\s*')
    KABUPATEN_SPLIT_PATTERN = re.compile(r'(?i)\s*(kabupaten|kota)\s*')
    TTL_PATTERN = re.compile(r'(?i)tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
    # Raw colon pattern string (kept for backward compatibility with callers
    # that expect the pattern text rather than a compiled object).
    COLON_PATTERN_STR = r'[:\uff1a]'

    # List of Indonesian provinces (38 provinces).
    PROVINSI_LIST = [
        "ACEH", "SUMATERA UTARA", "SUMATERA BARAT", "RIAU", "JAMBI", "SUMATERA SELATAN", "BENGKULU", "LAMPUNG",
        "KEPULAUAN BANGKA BELITUNG", "KEPULAUAN RIAU", "DKI JAKARTA", "JAWA BARAT", "JAWA TENGAH", "DI YOGYAKARTA",
        "JAWA TIMUR", "BANTEN", "BALI", "NUSA TENGGARA BARAT", "NUSA TENGGARA TIMUR", "KALIMANTAN BARAT",
        "KALIMANTAN TENGAH", "KALIMANTAN SELATAN", "KALIMANTAN TIMUR", "KALIMANTAN UTARA", "SULAWESI UTARA",
        "SULAWESI TENGAH", "SULAWESI SELATAN", "SULAWESI TENGGARA", "GORONTALO", "SULAWESI BARAT", "MALUKU",
        "MALUKU UTARA", "PAPUA BARAT", "PAPUA", "PAPUA SELATAN", "PAPUA TENGAH", "PAPUA PEGUNUNGAN", "PAPUA BARAT DAYA"
    ]

    # Keywords used to detect gender values.
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Valid religion values.
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupation keywords.
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # Valid marital-status values.
    STATUS_PERKAWINAN_LIST = ['BELUM KAWIN', 'KAWIN', 'CERAI HIDUP', 'CERAI MATI']

    # Field labels for fuzzy matching (handles OCR typos such as "Aamat" -> "ALAMAT").
    FIELD_LABELS = {
        'nama': ['NAMA'],
        'alamat': ['ALAMAT'],
        'agama': ['AGAMA'],
        'pekerjaan': ['PEKERJAAN'],
        'kewarganegaraan': ['KEWARGANEGARAAN', 'WARGANEGARA'],
        'tempat_lahir': ['TEMPAT', 'LAHIR', 'TEMPAT/TGL LAHIR'],
        'jenis_kelamin': ['JENIS KELAMIN', 'JENIS', 'KELAMIN'],
        'gol_darah': ['GOL. DARAH', 'GOL DARAH', 'GOLONGAN DARAH'],
        'kel_desa': ['KEL/DESA', 'KELURAHAN', 'DESA'],
        'kecamatan': ['KECAMATAN', 'KEC'],
        'status_perkawinan': ['STATUS PERKAWINAN', 'PERKAWINAN'],
        'berlaku_hingga': ['BERLAKU HINGGA', 'BERLAKU'],
        'rt_rw': ['RT/RW', 'RT', 'RW'],
    }

    # ============================================
    # Balinese Hindu naming system
    # ============================================
    # Structure: [gender prefix] + [caste title] + [gender marker] + [birth order] + [personal name]

    # Gender prefix (must appear at the start of the name).
    BALI_GENDER_PREFIX = {
        'NI': 'PEREMPUAN',   # female prefix
        'I': 'LAKI-LAKI',    # male prefix
    }

    # Caste titles (after the gender prefix).
    BALI_KASTA = {
        'IDA': 'BRAHMANA',
        'GUSTI': 'KSATRIA',
        'ANAK AGUNG': 'KSATRIA',
        'COKORDA': 'KSATRIA',
        'DEWA': 'KSATRIA',
        'DESAK': 'KSATRIA',
        'AGUNG': 'KSATRIA',
        'NGAKAN': 'WAISYA',
        'SANG': 'WAISYA',
        'SI': 'WAISYA',
    }

    # Additional gender markers (after the caste title).
    BALI_GENDER_MARKER = {
        'AYU': 'PEREMPUAN',
        'ISTRI': 'PEREMPUAN',
        'LUH': 'PEREMPUAN',
        'BAGUS': 'LAKI-LAKI',
        'GEDE': 'LAKI-LAKI',
        'AGUS': 'LAKI-LAKI',
        'ALIT': 'LAKI-LAKI',  # "small/young" (used for males)
    }

    # Birth-order names (cycle repeats every 4 children).
    BALI_BIRTH_ORDER = {
        'PUTU': 1, 'WAYAN': 1, 'GEDE': 1, 'ILUH': 1,
        'MADE': 2, 'KADEK': 2, 'NENGAH': 2,
        'NYOMAN': 3, 'KOMANG': 3,
        'KETUT': 4,
        'BALIK': 5,  # for the 5th+ child (cycle restarts)
    }

    # Soroh/clan names (lineage identification).
    BALI_SOROH = {
        'PASEK': 'SOROH',     # majority clan (~60% of Balinese Hindus)
        'PANDE': 'SOROH',     # blacksmith/metalworking clan
        'ARYA': 'SOROH',      # Arya clan
        'BENDESA': 'SOROH',   # customary leaders
        'TANGKAS': 'SOROH',   # Tangkas clan
        'CELAGI': 'SOROH',    # Celagi clan
        'SENGGUHU': 'SOROH',  # Sengguhu clan
        'KUBAYAN': 'SOROH',   # Kubayan clan
        'BANDESA': 'SOROH',   # variant of Bendesa
    }

    # All name components for detection (longer entries listed first).
    BALI_NAME_COMPONENTS = [
        # Gender prefixes
        'NI', 'I',
        # Caste titles (longer ones take priority)
        'ANAK AGUNG', 'COKORDA', 'NGAKAN',
        'IDA', 'GUSTI', 'DEWA', 'DESAK', 'AGUNG', 'SANG', 'SI',
        # Soroh/clans
        'PASEK', 'PANDE', 'ARYA', 'BENDESA', 'BANDESA', 'TANGKAS', 'CELAGI', 'SENGGUHU', 'KUBAYAN',
        # Gender markers
        'AYU', 'ISTRI', 'LUH', 'BAGUS', 'GEDE', 'AGUS', 'ALIT',
        # Birth order
        'WAYAN', 'PUTU', 'ILUH', 'MADE', 'KADEK', 'NENGAH', 'NYOMAN', 'KOMANG', 'KETUT', 'BALIK',
    ]

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max)
    # based on the standard KTP layout. Dict order matters: _get_zone returns
    # the FIRST zone containing a point, so overlapping zones resolve top-down.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Name area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Place/date of birth
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Gender (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Blood type (right of gender)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Address
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Village
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # District
        'agama': (0.02, 0.63, 0.70, 0.72),             # Religion
        'status': (0.02, 0.69, 0.70, 0.78),            # Marital status
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Occupation
        'wni': (0.02, 0.81, 0.70, 0.90),               # Citizenship
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Valid until
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Issuing place & date
    }
def __init__(self):
self.image_width = 0
self.image_height = 0
def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
"""Determine which zone a text belongs to based on normalized coordinates"""
if img_width == 0 or img_height == 0:
return None
# Normalize coordinates
x_norm = x_center / img_width
y_norm = y_center / img_height
for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
return zone_name
return None
def _extract_value_from_text(self, text: str) -> str:
"""Extract value part from label:value text"""
# Split by colon (standard or full-width)
parts = re.split(r'[:]', text, 1)
if len(parts) > 1:
return parts[1].strip()
return text.strip()
def _find_best_match(self, text: str, candidates: List[str], cutoff: float = 0.6) -> Optional[str]:
"""Find best fuzzy match from candidates"""
matches = difflib.get_close_matches(text, candidates, n=1, cutoff=cutoff)
return matches[0] if matches else None
def _is_label_match(self, text: str, field_name: str, cutoff: float = 0.7) -> bool:
"""
Fuzzy match untuk label field - mengatasi typo OCR seperti "Aamat" -> "ALAMAT"
Returns True jika text cocok dengan salah satu label untuk field tersebut
"""
if not text or not text.strip():
return False
if field_name not in self.FIELD_LABELS:
return field_name.lower() in text.lower()
text_upper = text.upper().strip()
# Explicit conflict prevention
if field_name == 'agama' and 'ALAMAT' in text_upper:
return False
if field_name == 'alamat' and 'AGAMA' in text_upper:
return False
# Coba exact match dulu (lebih cepat)
for label in self.FIELD_LABELS[field_name]:
if label in text_upper:
return True
# Fuzzy match jika tidak ada exact match
# Ekstrak kata pertama dari text (biasanya label ada di awal)
parts = text_upper.split(':')[0].split()
if not parts:
return False
first_word = parts[0]
for label in self.FIELD_LABELS[field_name]:
label_parts = label.split()
if not label_parts:
continue
# Bandingkan dengan kata pertama
ratio = difflib.SequenceMatcher(None, first_word, label_parts[0]).ratio()
# Dynamic cutoff logic
effective_cutoff = cutoff
if len(first_word) < 7:
# Use stricter cutoff for short words to prevent ALAMAT (6) matching AGAMA (5) -> ratio 0.73
effective_cutoff = max(cutoff, 0.82)
if ratio >= effective_cutoff:
if DEBUG_MODE:
print(f" [FUZZY LABEL] '{first_word}' matched '{label}' (ratio={ratio:.2f})")
return True
return False
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
"""
Extract content after a label (fuzzy/regex match).
Handles cases with/without colons.
"""
if not text: return None
# 1. Try Regex Search if pattern provided
if label_pattern:
# Construct regex: Label + optional spaces/colon + (Group 1: Value)
# flags=re.IGNORECASE should be used
# We want to find the END of the label
match = re.search(f"({label_pattern})[:\\s]*", text, re.IGNORECASE)
if match:
# Return everything after the match end
return text[match.end():].strip()
return None
def _parse_balinese_name(self, name: str) -> str:
    """
    Re-insert spaces into a Balinese name that OCR merged into one token.

    Example: "NIGUSTIAYUNYOMANSUWETRI" -> "NI GUSTI AYU NYOMAN SUWETRI"

    Balinese name structure:
    [gender prefix] + [caste title] + [gender marker] + [birth order] + [personal name]

    IMPORTANT: only names that actually contain Balinese components are
    processed; everything else is returned upper-cased and otherwise untouched.
    """
    if not name:
        return name
    name_upper = name.upper().strip()
    # Already has a reasonable number of spaces -- return as-is.
    if name_upper.count(' ') >= 2:
        return name_upper
    # Decide whether the name looks Balinese: it must start with NI,
    # I GUSTI, IDA, or a Balinese birth-order component.
    name_clean = name_upper.replace(' ', '')
    is_balinese_name = False
    # Check typical Balinese prefixes.
    if name_clean.startswith('NI') and len(name_clean) > 3:
        # "NI" must be followed by another Balinese component (GUSTI, LUH, WAYAN, ...).
        after_ni = name_clean[2:]
        for comp in ['GUSTI', 'LUH', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG', 'PASEK', 'PANDE']:
            if after_ni.startswith(comp):
                is_balinese_name = True
                break
    elif name_clean.startswith('IGUSTI') or name_clean.startswith('IDABAGUS') or name_clean.startswith('IDAAYU'):
        is_balinese_name = True
    elif any(name_clean.startswith(p) for p in ['GUSTI', 'WAYAN', 'PUTU', 'MADE', 'KADEK', 'NYOMAN', 'KOMANG', 'KETUT']):
        is_balinese_name = True
    if not is_balinese_name:
        # Not a Balinese name: return with standard spacing.
        # If it already has one space, return unchanged.
        if ' ' in name_upper:
            return name_upper
        # No space at all: return unchanged (may genuinely be a single word).
        return name_upper
    # Components to search for, longest-first for accuracy (so e.g.
    # "ANAK AGUNG" wins over "AGUNG").
    components_ordered = sorted(self.BALI_NAME_COMPONENTS, key=len, reverse=True)
    result_parts = []
    remaining = name_clean
    # Parse the gender prefix (leading NI or I).
    if remaining.startswith('NI'):
        result_parts.append('NI')
        remaining = remaining[2:]
    elif remaining.startswith('I') and len(remaining) > 1:
        # Make sure the I is not merely the first letter of another word.
        next_char = remaining[1] if len(remaining) > 1 else ''
        # Only treat it as the male prefix when followed by a consonant.
        if next_char not in 'AIUEO':
            result_parts.append('I')
            remaining = remaining[1:]
    # Greedily peel known components off the front of the remainder.
    # NOTE(review): duplicated components are consumed but not re-appended
    # (the second clause of the dedup condition is always False since
    # components_ordered is drawn from BALI_NAME_COMPONENTS) -- confirm this
    # drop-on-duplicate behavior is intended.
    found = True
    max_iterations = 10  # Prevent infinite loop
    iteration = 0
    while remaining and found and iteration < max_iterations:
        found = False
        iteration += 1
        for component in components_ordered:
            if remaining.startswith(component):
                # Skip components already collected (except the personal name)
                if component not in result_parts or component not in self.BALI_NAME_COMPONENTS:
                    result_parts.append(component)
                remaining = remaining[len(component):]
                found = True
                break
    # Whatever is left over is the personal name.
    if remaining:
        result_parts.append(remaining)
    parsed_name = ' '.join(result_parts)
    # Log when a change was made (unconditional, not gated by DEBUG_MODE).
    if parsed_name != name_upper:
        print(f" [BALI NAME] '{name_upper}' -> '{parsed_name}'")
    return parsed_name
def _search_best_match_in_text(self, text: str, candidates: List[str], prefix: str = "") -> tuple:
"""
Search if any candidate is present in text using multiple strategies:
1. Exact substring
2. Prefix + Candidate (Fuzzy) - e.g. "PROVINSI BALI"
3. Candidate Only (Fuzzy) - e.g. "BALI" (if prefix is missing/damaged)
Returns (best_candidate, confidence_score)
"""
text_upper = text.upper()
best_match = None
best_ratio = 0.0
# Strategy 1: Exact substring match (fastest & most reliable)
for candidate in candidates:
if candidate in text_upper:
if len(candidate) > len(best_match or ""):
best_match = candidate
best_ratio = 1.0
if best_ratio == 1.0:
return best_match, best_ratio
# Strategy 2: Prefix Construction & Fuzzy Match
prefix_upper = prefix.upper() if prefix else ""
# DEBUG: Print checking (controlled by DEBUG_MODE)
if DEBUG_MODE:
print(f"DEBUG Check Text: '{text_upper}' with Prefix: '{prefix_upper}'")
for candidate in candidates:
# 2a. Compare with Prefix + Space (e.g. "PROVINSI BALI")
if prefix:
target_spaced = f"{prefix_upper} {candidate}"
s_spaced = difflib.SequenceMatcher(None, target_spaced, text_upper)
ratio_spaced = s_spaced.ratio()
# print(f" -> Compare '{target_spaced}' vs '{text_upper}' = {ratio_spaced:.2f}")
if ratio_spaced > best_ratio and ratio_spaced > 0.5:
best_ratio = ratio_spaced
best_match = candidate
# 2b. Compare with Prefix NO SPACE (e.g. "PROVINSIBALI")
# This handles "PROVNSIBALI" perfectly
target_merged = f"{prefix_upper}{candidate}"
s_merged = difflib.SequenceMatcher(None, target_merged, text_upper)
ratio_merged = s_merged.ratio()
if DEBUG_MODE:
print(f" -> Compare Merged '{target_merged}' vs '{text_upper}' = {ratio_merged:.2f}")
if ratio_merged > best_ratio and ratio_merged > 0.5:
best_ratio = ratio_merged
best_match = candidate
# 2c. Compare Candidate ONLY (e.g. "BALI")
if len(candidate) > 3:
s_raw = difflib.SequenceMatcher(None, candidate, text_upper)
ratio_raw = s_raw.ratio()
# print(f" -> Compare Raw '{candidate}' vs '{text_upper}' = {ratio_raw:.2f}")
if ratio_raw > best_ratio and ratio_raw > 0.6:
best_ratio = ratio_raw
best_match = candidate
if DEBUG_MODE:
print(f"DEBUG Best Match: {best_match} ({best_ratio:.2f})")
return best_match, best_ratio
def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
"""Detect image dimensions from bounding boxes"""
max_x, max_y = 0, 0
for r in ocr_results:
bbox = r.get('bbox', [])
if bbox and len(bbox) >= 4:
for point in bbox:
if len(point) >= 2:
max_x = max(max_x, point[0])
max_y = max(max_y, point[1])
# Add some margin
return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)
def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
"""Extract fields based on zone assignments"""
# PROVINSI from header
if 'header_provinsi' in zone_texts:
print(f"DEBUG Zone Provinsi Content: {zone_texts['header_provinsi']}")
for text in zone_texts['header_provinsi']:
text_clean = text.strip()
# Use prefix strategy: "PROVINSI " + result vs text
match, score = self._search_best_match_in_text(text_clean, self.PROVINSI_LIST, prefix="PROVINSI")
# LOWER THRESHOLD to 0.5 because "PROVINSI BALI" vs "PROVNSIBALI" is roughly 0.5-0.6 range
if match and score > 0.5:
result['provinsi'] = match
# Remove the found province (and label) from text to see what's left
# If we matched "PROVINSI JAWA TIMUR", the text might be "PROVNSIJAWATMRKABUPATENSUMENEP"
# It's hard to cleanly remove "PROVISI JAWA TIMUR" if it was fuzzy matched.
# BUT, we can try to find "KABUPATEN" or "KOTA" in the original text
# independent of the province match
if 'kabupaten' in text_clean.lower() or 'kota' in text_clean.lower():
parts = re.split(r'(?i)\s*(kabupaten|kota)', text_clean)
if len(parts) > 1:
kab_part = "".join(parts[1:]).strip()
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
if kab_val and result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_clean.lower() else "KOTA"
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
break
# Fallback to keyword splitting (Legacy/Blurry fallback)
text_lower = text.lower()
val = text
# If keyword exists, strip it
if 'provinsi' in text_lower:
split_prov = re.split(r'(?i)provinsi\s*', text, 1)
if len(split_prov) > 1:
val = split_prov[1].strip()
else:
val = ""
# Check for merged text
if 'kabupaten' in text_lower or 'kota' in text_lower:
parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
val = parts[0].strip()
if len(parts) > 1:
kab_part = "".join(parts[1:]).strip()
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
if kab_val and result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
if val and len(val) > 2:
# Try fuzzy match again on the cleaned value
best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val.upper()
break
# KABUPATEN/KOTA from header
if 'header_kabupaten' in zone_texts:
for text in zone_texts['header_kabupaten']:
text_lower = text.lower()
val = text
# Check keyword
if 'kabupaten' in text_lower or 'kota' in text_lower:
split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
if len(split_kab) > 1:
val = split_kab[-1].strip()
else:
val = ""
# If no keyword, but it's in the kabupaten zone, assume it's data
if val:
# Re-add prefix standard if we separated it or if it was missing
# Heuristic: if validation suggests it's a known regency, we are good.
# For now, standardize format.
if result['kabupaten_kota'] is None:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
# If no keyword found, default to KABUPATEN? Or better check Wilayah?
# Let's default to detected keyword or KABUPATEN
if "kota" in text_lower:
prefix = "KOTA"
else:
prefix = "KABUPATEN"
result['kabupaten_kota'] = f"{prefix} {val.upper()}"
break
# NAMA from nama zone (skip label line)
if 'nama' in zone_texts:
for text in zone_texts['nama']:
text_lower = text.lower()
if 'nama' not in text_lower and len(text) > 2:
result['nama'] = text.upper()
break
elif 'nama' in text_lower:
val = self._extract_value_from_text(text)
if val and 'nama' not in val.lower():
result['nama'] = val.upper()
# TTL from ttl zone
if 'ttl' in zone_texts:
for text in zone_texts['ttl']:
# Skip if text is JUST the label (length check or fuzzy match)
if len(text) < 15 and self._is_label_match(text, 'tempat_lahir'):
continue
if 'tempat' in text.lower() or 'lahir' in text.lower() or 'tgl' in text.lower() or len(text) > 5:
val = self._extract_value_from_text(text)
if val:
# Don't accept if val looks like label
if self._is_label_match(val, 'tempat_lahir') and len(val) < 20:
continue
self._parse_ttl(val, result)
# Only break if we actually got a birth date, otherwise keep looking
if result['tanggal_lahir']:
break
# JENIS KELAMIN
if 'jenis_kelamin' in zone_texts:
for text in zone_texts['jenis_kelamin']:
text_lower = text.lower()
if 'laki' in text_lower:
result['jenis_kelamin'] = 'LAKI-LAKI'
break
elif 'perempuan' in text_lower or 'wanita' in text_lower:
result['jenis_kelamin'] = 'PEREMPUAN'
break
# GOL DARAH
if 'gol_darah' in zone_texts:
for text in zone_texts['gol_darah']:
gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
if gol_match:
result['gol_darah'] = gol_match.group(1).upper()
break
# ALAMAT
if 'alamat' in zone_texts:
for text in zone_texts['alamat']:
if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1:
val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
if val and 'alamat' not in val.lower():
result['alamat'] = val.upper()
break
# RT/RW
if 'rt_rw' in zone_texts:
for text in zone_texts['rt_rw']:
rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
if rt_rw_match:
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
break
# KEL/DESA
if 'kel_desa' in zone_texts:
for text in zone_texts['kel_desa']:
if 'kel' in text.lower() or 'desa' in text.lower():
val = self._extract_value_from_text(text)
if val and 'kel' not in val.lower():
result['kel_desa'] = val.upper()
break
elif result['kel_desa'] is None:
# Fallback context: simple text
result['kel_desa'] = text.upper()
# KECAMATAN
if 'kecamatan' in zone_texts:
for text in zone_texts['kecamatan']:
if 'kec' in text.lower():
val = self._extract_value_from_text(text)
if val and 'kec' not in val.lower():
result['kecamatan'] = val.upper()
break
elif result['kecamatan'] is None:
result['kecamatan'] = text.upper()
# AGAMA
if 'agama' in zone_texts:
for text in zone_texts['agama']:
val = text.upper()
if 'agama' in text.lower():
val = self._extract_value_from_text(text).upper()
# Verify against valid list
for agama in self.AGAMA_LIST:
if agama.upper() in val:
result['agama'] = agama.upper()
break
if result['agama']: break
# STATUS PERKAWINAN
if 'status' in zone_texts:
for text in zone_texts['status']:
val = text.upper()
# Normalize common OCR errors (e.g. BELUMKAWIN)
val = val.replace("BELUMKAWIN", "BELUM KAWIN")
# Check against official list
found_status = False
for status in self.STATUS_PERKAWINAN_LIST:
if status in val:
result['status_perkawinan'] = status
found_status = True
break
if found_status: break
# PEKERJAAN
if 'pekerjaan' in zone_texts:
best_job = None
potential_job = None
for text in zone_texts['pekerjaan']:
val = text.upper()
if 'pekerjaan' in text.lower():
val = self._extract_value_from_text(text).upper()
# Clean up
val = val.strip()
if not val or len(val) < 3 or 'PEKERJAAN' in val:
continue
# 1. Check against wildcard/list (Priority)
# Buruh, Karyawan, Pelajar, dll
if any(job.upper() in val for job in self.PEKERJAAN_LIST):
best_job = val
break # Found a definitive job
# 2. Save as potential if it's NOT a known bad value (like City names)
# Avoid capturing 'TABANAN', 'JAKARTA', date strings
if not any(city in val for city in ['KABUPATEN', 'KOTA', 'TABANAN', 'BADUNG', 'DENPASAR', 'JAKARTA', 'BANDUNG']):
if not re.search(r'\d{2}-\d{2}-\d{4}', val): # Avoid dates
if potential_job is None:
potential_job = val
if best_job:
result['pekerjaan'] = best_job
elif potential_job:
result['pekerjaan'] = potential_job
# WNI
if 'wni' in zone_texts:
for text in zone_texts['wni']:
if 'wni' in text.lower():
result['kewarganegaraan'] = 'WNI'
break
elif 'wna' in text.lower():
result['kewarganegaraan'] = 'WNA'
break
# PENERBITAN area (tempat & tanggal dalam satu zona)
if 'penerbitan' in zone_texts:
for text in zone_texts['penerbitan']:
# Look for date
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
if date_match and result['tanggal_penerbitan'] is None:
result['tanggal_penerbitan'] = date_match.group(1)
def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
"""
Ekstrak field KTP dari hasil OCR dengan template-based zone detection
Args:
ocr_results: List hasil dari OCREngine.extract_text()
Returns:
Dict dengan field KTP
"""
result = {
'nik': None,
'nama': None,
'tempat_lahir': None,
'tanggal_lahir': None,
'jenis_kelamin': None,
'gol_darah': None,
'alamat': None,
'rt_rw': None,
'kel_desa': None,
'kecamatan': None,
'agama': None,
'status_perkawinan': None,
'pekerjaan': None,
'kewarganegaraan': None,
'berlaku_hingga': 'SEUMUR HIDUP', # Default sesuai peraturan pemerintah e-KTP
'provinsi': None,
'kabupaten_kota': None,
'tanggal_penerbitan': None,
}
# Detect image dimensions from bounding boxes
img_width, img_height = self._detect_image_size(ocr_results)
# Assign zones to each OCR result
zone_texts = {} # zone_name -> list of texts
for r in ocr_results:
x_center = r.get('x_center', 0)
y_center = r.get('y_center', 0)
zone = self._get_zone(x_center, y_center, img_width, img_height)
if zone:
if zone not in zone_texts:
zone_texts[zone] = []
zone_texts[zone].append(r['text'])
# Debug: print zone assignments
print("\n[DEBUG KTPExtractor] Zone assignments:")
for zone, texts in zone_texts.items():
print(f" {zone}: {texts}")
# Extract fields using zone-based approach
self._extract_by_zones(zone_texts, result)
# Gabungkan semua teks untuk fallback pattern matching
texts = [r['text'].strip() for r in ocr_results]
all_text = '\n'.join(texts)
# Ekstrak NIK (16 digit) - bisa ada di mana saja
nik_match = re.search(r'\b(\d{16})\b', all_text)
if nik_match:
result['nik'] = nik_match.group(1)
print(f" -> NIK found: {result['nik']}")
# Fallback: Parse line by line for fields not found by zone
for i, text in enumerate(texts):
# Skip baris yang hanya berisi punctuation atau kosong
text_stripped = text.strip()
if not text_stripped or text_stripped in [':', '', '.', '-', '/', '|']:
continue
# Skip baris yang terlalu pendek (hanya 1-2 karakter non-alfanumerik)
if len(text_stripped) <= 2 and not any(c.isalnum() for c in text_stripped):
continue
text_lower = text.lower()
# Normalize colons
text_normalized = re.sub(self.COLON_PATTERN, ':', text)
text_norm_lower = text_normalized.lower()
# ===== PROVINSI =====
if 'provinsi' in text_lower and result['provinsi'] is None:
# Split by PROVINSI and take remainder
split_prov = re.split(r'(?i)provinsi\s*', text, 1)
if len(split_prov) > 1:
val = split_prov[1].strip()
# Check if it contains kabupaten/kota (merged line case)
if 'kabupaten' in val.lower() or 'kota' in val.lower():
parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
val = parts[0].strip()
if val:
# Fuzzy match against valid provinces
best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val.upper()
# Check for next line if current line only had 'PROVINSI'
if result['provinsi'] is None and i + 1 < len(texts):
next_text = texts[i+1].strip()
next_lower = next_text.lower()
# Only take next line if it doesn't look like another field
if not any(kw in next_lower for kw in ['provinsi', 'kabupaten', 'kota', 'nik']):
# Fuzzy match next line
val = next_text.upper()
best_match = self._find_best_match(val, self.PROVINSI_LIST, cutoff=0.6)
if best_match:
result['provinsi'] = best_match
else:
result['provinsi'] = val
# ===== KABUPATEN/KOTA =====
if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None:
if 'provinsi' not in text_lower: # Bukan bagian dari provinsi
# Split by KABUPATEN or KOTA and take remainder
split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
if len(split_kab) > 1:
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
val = split_kab[-1].strip()
if val:
result['kabupaten_kota'] = f"{prefix} {val.upper()}"
else:
result['kabupaten_kota'] = text.strip().upper()
else:
result['kabupaten_kota'] = text.strip().upper()
# ===== NAMA =====
if result['nama'] is None and self._is_label_match(text, 'nama'):
val = self._extract_after_label(text_normalized, 'nama')
current_name = ""
if val:
current_name = val.upper()
# Loop check baris berikutnya for Name (handle 2-3 lines)
offset = 1
# Batasi maksimal 2 baris tambahan untuk Nama (total 3 baris)
while i + offset < len(texts) and offset <= 2:
next_text = texts[i+offset].strip()
next_lower = next_text.lower()
is_stop = False
# 1. Check Stop Keywords (Field Labels below Name)
# Stop if next line is Tempat Lahir, Jenis Kelamin, Alamat, etc.
stop_keywords = ['tempat', 'lahir', 'tgl', 'jenis', 'kelamin', 'alamat', 'rt/rw', 'nik']
if any(kw in next_lower for kw in stop_keywords):
is_stop = True
print(f" [NAMA STOP] Matched stop keyword in '{next_text}'")
# 2. Check Case Sensitivity (Heuristic)
if not is_stop:
letters = [c for c in next_text if c.isalpha()]
if letters:
upper_count = sum(1 for c in letters if c.isupper())
upper_ratio = upper_count / len(letters)
# If mostly lowercase/title case, likely a label (e.g. "Tempat Lahir")
if upper_ratio < 0.4 and len(letters) > 3:
is_stop = True
print(f" [NAMA STOP] Likely Label based on Case (Ratio={upper_ratio:.2f})")
if not is_stop:
if len(next_text) > 2:
print(f" [NAMA MERGE] Merging '{next_text}'")
if current_name:
current_name += " " + next_text.upper()
else:
current_name = next_text.upper()
offset += 1
else:
print(f" [NAMA SKIP] Too short '{next_text}'")
# Kalau terlalu pendek (noise), boleh skip atau stop?
# Biasanya nama tidak putus jadi 1 huruf. Anggap stop utk aman, atau skip.
# Kita skip saja increment offset.
offset += 1
else:
break
if current_name:
# Fix Spacing Issues (e.g. BAGUSGEDE -> BAGUS GEDE)
current_name = re.sub(r'(BAGUS)(GEDE)', r'\1 \2', current_name)
current_name = re.sub(r'(ANAK)(AGUNG)', r'\1 \2', current_name) # Common issue
result['nama'] = current_name
# ===== TEMPAT/TANGGAL LAHIR =====
# ... (starts around line 830 in original) ...
# (Skipping down to ALAMAT section for the replacement block)
# ... regex find ...
# ===== ALAMAT ===== (dengan fuzzy label matching)
if result['alamat'] is None and self._is_label_match(text, 'alamat'):
val = self._extract_after_label(text_normalized, r'a{1,2}l{0,2}a?m{0,2}a?t')
# Logic multi-line
current_addr = ""
if val:
current_addr = val.upper()
# Loop check baris berikutnya (bisa ambil i+1, i+2, dst selama bukan label)
offset = 1
while i + offset < len(texts):
next_text = texts[i+offset].strip()
print(f" [ALAMAT CHECK] Offset +{offset}: '{next_text}'")
next_lower = next_text.lower()
is_stop = False
# 1. Cek Pola RT/RW (angka/angka) -> Pasti STOP
if re.search(r'\d{3}\s*/\s*\d{3}', next_text) or re.match(r'^[.\-]+\s*/\s*[.\-]+$', next_text):
is_stop = True
print(" [ALAMAT STOP] Matched RT/RW pattern")
# 2. Cek Keywords Label Pembatas
elif any(next_lower.startswith(prefix) for prefix in ['rt/', 'rw', 'rt/rw', 'kel', 'desa', 'kec', 'agama', 'status', 'kawin']):
is_stop = True
print(" [ALAMAT STOP] Matched label prefix")
# 3. Cek Keywords Spesifik Full Word
elif any(kw in next_lower for kw in ['kelurahan', 'kecamatan', 'perkawinan', 'kewarganegaraan']):
is_stop = True
print(" [ALAMAT STOP] Matched distinct label word")
# 4. Check Case Sensitivity
if not is_stop:
letters = [c for c in next_text if c.isalpha()]
if letters:
upper_count = sum(1 for c in letters if c.isupper())
upper_ratio = upper_count / len(letters)
# Jika hampir semua huruf kecil/Title Case (ratio < 0.4), dicurigai sebagai Label
# Kecuali kata-kata pendek (< 5 chars)
if upper_ratio < 0.4 and len(letters) > 4:
is_stop = True
print(f" [ALAMAT STOP] Detected Title Case/Lowercase (Ratio={upper_ratio:.2f}) -> Likely Label")
# Jika BUKAN pembatas, AMBIL sebagai lanjutan alamat
if not is_stop:
if len(next_text) > 1:
print(f" [ALAMAT MERGE] Merging '{next_text}'")
if current_addr:
current_addr += " " + next_text.upper()
else:
current_addr = next_text.upper()
offset += 1 # Lanjut cek baris berikutnya
else:
print(f" [ALAMAT SKIP] Line too short '{next_text}'")
offset += 1 # Skip noise, try next line? Or stop? usually skip noise is safer to continue
else:
print(f" [ALAMAT STOP] Hit Stop Condition '{next_text}'")
break # Stop loop
if current_addr:
result['alamat'] = current_addr
if current_addr:
result['alamat'] = current_addr
# ===== RT/RW =====
# Relaxed pattern to handle -/- or 000/000
if result['rt_rw'] is None:
rt_rw_match = re.search(r'(\d{1,3}|-)\s*/\s*(\d{1,3}|-)', text)
if rt_rw_match:
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
print(f" [RT/RW] Found {result['rt_rw']}")
# ===== KELURAHAN/DESA =====
if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
if result['kel_desa'] is None:
val = self._extract_after_label(text_normalized, 'kel|desa')
if val:
result['kel_desa'] = val.upper()
elif i + 1 < len(texts):
result['kel_desa'] = texts[i+1].strip().upper()
# ===== TEMPAT/TANGGAL LAHIR =====
# Gunakan _is_label_match untuk fleksibilitas (e.g. Tempat/Tgl Lahir, Tmpt Lahir)
if result['tempat_lahir'] is None and self._is_label_match(text, 'ttl'):
print(f" [TTL DEBUG] Matched Label on line {i}: '{text}'")
# Regex pattern yang SANGAT fleksibel untuk label TTL
# Menangani berbagai variasi: Tmpat/Tgl Lahir, Tempat. Tgl. Lahir, dll
# Intinya: T...mp...t <junk> L...hir
val = self._extract_after_label(text_normalized, r't[ea]m?p?a?t.*?l[a@]hi?r?|tgl.*?l[a@]hi?r?')
# Jika val kosong, coba ambil dari baris berikutnya
if not val and i + 1 < len(texts):
next_text = texts[i+1].strip()
next_lower = next_text.lower()
stop_keywords = ['jenis', 'kelamin', 'alamat', 'gol', 'darah']
if not any(kw in next_lower for kw in stop_keywords):
val = next_text.upper()
print(f" [TTL DEBUG] Took next line: '{val}'")
if val:
print(f" [TTL DEBUG] Parsing value: '{val}'")
self._parse_ttl(val, result)
if result['tanggal_lahir']:
print(f" [TTL DEBUG] Success: {result['tanggal_lahir']}")
# ===== JENIS KELAMIN =====
if result['jenis_kelamin'] is None:
# 1. Coba cari Label dulu
if self._is_label_match(text, 'jenis_kelamin'):
val = self._extract_after_label(text_normalized, r'j[ea]ni?s\s*k[ea]l[a@]?mi?n')
if val:
if 'LAKI' in val.upper(): result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in val.upper() or 'WANITA' in val.upper(): result['jenis_kelamin'] = 'PEREMPUAN'
if result['jenis_kelamin'] is None and i + 1 < len(texts):
next_text = texts[i+1].upper()
if 'LAKI' in next_text: result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in next_text or 'WANITA' in next_text: result['jenis_kelamin'] = 'PEREMPUAN'
# 2. Fallback: Cari langsung keyword VALUES
if result['jenis_kelamin'] is None:
text_upper = text.upper()
if 'LAKI-LAKI' in text_upper or 'LAKI - LAKI' in text_upper:
result['jenis_kelamin'] = 'LAKI-LAKI'
elif 'PEREMPUAN' in text_upper:
result['jenis_kelamin'] = 'PEREMPUAN'
# ===== GOLONGAN DARAH =====
if result['gol_darah'] is None:
# Cek label
if self._is_label_match(text, 'gol_darah'):
val = self._extract_after_label(text_normalized, r'g?o?l\.?\s*d?a?r?a?h')
# Jika label ketemu tapi val kosong, mungkin nempel (Gol.Darah : O)
# atau ada di baris ini
if val:
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', val)
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
else:
# Coba cari pattern gol darah di baris yang sama dengan label
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', text.upper().replace('0','O'))
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
# Cek next line jika baris ini cuma label "Gol Darah"
if result['gol_darah'] is None and self._is_label_match(text, 'gol_darah') and i+1 < len(texts):
next_text = texts[i+1].strip().upper()
if len(next_text) < 5: # Pendek, asumsi gol darah
gd_match = re.search(r'([ABO]{1,2}[+\-]?)', next_text)
if gd_match:
result['gol_darah'] = gd_match.group(1).upper()
# ===== KECAMATAN =====
if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
if result['kecamatan'] is None:
val = self._extract_after_label(text_normalized, 'kecamatan|kec')
if val:
result['kecamatan'] = val.upper()
elif i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']):
result['kecamatan'] = next_text.upper()
# ===== AGAMA ===== (dengan fuzzy label matching)
if self._is_label_match(text, 'agama'):
val = self._extract_after_label(text_normalized, r'a?g{0,2}a?m{0,2}a')
if val and result['agama'] is None:
result['agama'] = val.upper()
elif result['agama'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip().upper()
if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']:
result['agama'] = next_text
else:
# Check if line contains only agama name
for agama in self.AGAMA_LIST:
if agama in text_lower and len(text) < 20:
if result['agama'] is None:
result['agama'] = text.strip().upper()
break
# ===== STATUS PERKAWINAN =====
if 'kawin' in text_lower:
if result['status_perkawinan'] is None:
# Check against official list first
text_upper = text.upper().replace("BELUMKAWIN", "BELUM KAWIN")
for status in self.STATUS_PERKAWINAN_LIST:
if status in text_upper:
result['status_perkawinan'] = status
break
# Fallback to extraction if not found in list
if result['status_perkawinan'] is None:
val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
if val:
result['status_perkawinan'] = val.upper()
# ===== PEKERJAAN =====
if 'pekerjaan' in text_lower:
val = self._extract_after_label(text_normalized, 'pekerjaan')
if val and result['pekerjaan'] is None:
result['pekerjaan'] = val.upper()
elif result['pekerjaan'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
result['pekerjaan'] = next_text.upper()
else:
# Check if line contains pekerjaan keyword
for pekerjaan in self.PEKERJAAN_LIST:
if pekerjaan in text_lower and len(text) < 30:
if result['pekerjaan'] is None:
result['pekerjaan'] = text.strip().upper()
break
# ===== KEWARGANEGARAAN =====
if 'wni' in text_lower:
result['kewarganegaraan'] = 'WNI'
elif 'wna' in text_lower:
result['kewarganegaraan'] = 'WNA'
elif 'warga' in text_lower and result['kewarganegaraan'] is None:
val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
if val:
result['kewarganegaraan'] = val.upper()
# ===== BERLAKU HINGGA =====
if 'berlaku' in text_lower or 'seumur' in text_lower:
if result['berlaku_hingga'] is None:
if 'seumur' in text_lower or 'hidup' in text_lower:
result['berlaku_hingga'] = 'SEUMUR HIDUP'
else:
val = self._extract_after_label(text_normalized, 'berlaku')
if val:
result['berlaku_hingga'] = val.upper()
# ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) =====
# Look for date that is NOT tanggal lahir (different date)
if result['tanggal_penerbitan'] is None:
# 1. Skip if contains Keywords of other date fields
# Jangan ambil jika ada kata 'LAHIR', 'TGL', 'BERLAKU', 'SEUMUR', 'HINGGA'
line_clean = text.lower()
if any(kw in line_clean for kw in ['lahir', 'lahlr', 'tgl', 'tempat', 'berlaku', 'seumur', 'hingga', 'hidup']):
pass # Skip
else:
# Match date format at end of text or standalone date
date_match = re.search(r'(\d{2}[-\s/]\d{2}[-\s/]\d{4})$', text.strip())
if date_match:
found_date = date_match.group(1).replace(' ', '-')
# Make sure it's not the same as tanggal_lahir
if result['tanggal_lahir'] != found_date:
# Strict Position Check: MUST be in the bottom 30% of lines
# (Untuk menghindari salah ambil tanggal lahir yg mungkin gagal diparsing sbg TTL)
if i > len(texts) * 0.7:
result['tanggal_penerbitan'] = found_date
print(f" [TGL TERBIT] Found '{found_date}' at index {i}/{len(texts)}")
else:
print(f" [TGL TERBIT SKIP] Date '{found_date}' is too high ({i}/{len(texts)})")
# ============================================
# AGGRESSIVE SCAN: Cari agama dari semua teks OCR
# ============================================
# Indonesia hanya punya 6 agama resmi, mudah dideteksi
if result['agama'] is None:
# Daftar agama dengan variasi penulisan
agama_patterns = {
'ISLAM': ['ISLAM', 'ISLM', 'ISIAM', 'ISLAMI'],
'KRISTEN': ['KRISTEN', 'KRISTEN PROTESTAN', 'PROTESTAN', 'KRISTN'],
'KATOLIK': ['KATOLIK', 'KATHOLIK', 'KATHOLK', 'KATOLIK ROMA', 'KATOLIK.'],
'HINDU': ['HINDU', 'HNDU', 'HINDU DHARMA', 'HINDHU'],
'BUDDHA': ['BUDDHA', 'BUDHA', 'BUDDA', 'BUDDHIS'],
'KONGHUCU': ['KONGHUCU', 'KHONGHUCU', 'KONGHUCHU', 'CONFUCIUS'],
}
for text in texts:
text_upper = text.upper().strip()
# Skip jika teks terlalu pendek atau terlalu panjang
if len(text_upper) < 4 or len(text_upper) > 30:
continue
for agama_std, variants in agama_patterns.items():
for variant in variants:
if variant in text_upper:
result['agama'] = agama_std
print(f" [AGAMA SCAN] Found '{variant}' in '{text_upper}' -> {agama_std}")
break
if result['agama']:
break
if result['agama']:
break
# ============================================
# AGGRESSIVE SCAN: Cari golongan darah dari semua teks OCR
# ============================================
# Golongan darah hanya 4: A, B, AB, O (dengan/tanpa rhesus +/-)
if result['gol_darah'] is None:
gol_darah_patterns = ['AB+', 'AB-', 'A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB', 'A', 'B', 'O']
for text in texts:
text_upper = text.upper().strip()
# Hapus punctuation umum
text_clean = re.sub(r'[:\.\,\s]+', '', text_upper)
# Konversi 0 (nol) menjadi O (huruf) - OCR sering salah baca
text_clean = text_clean.replace('0', 'O')
# Skip jika teks terlalu panjang (bukan gol darah)
if len(text_clean) > 10:
continue
# Cari match untuk gol darah (dari panjang ke pendek untuk prioritas AB sebelum A/B)
for gol in gol_darah_patterns:
# Exact match setelah dibersihkan
if text_clean == gol:
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
# Match dengan prefix GOL
if text_clean == f"GOL{gol}" or text_clean == f"GOLDARAH{gol}":
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
# Match sebagai single character di akhir teks pendek
if len(text_clean) <= 3 and text_clean.endswith(gol):
result['gol_darah'] = gol
print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
break
if result['gol_darah']:
break
# ============================================
# AGGRESSIVE SCAN: Cari berlaku hingga dari semua teks OCR
# ============================================
if result['berlaku_hingga'] is None:
for text in texts:
text_upper = text.upper().strip()
if 'SEUMUR' in text_upper or 'HIDUP' in text_upper:
result['berlaku_hingga'] = 'SEUMUR HIDUP'
print(f" [BERLAKU SCAN] Found '{text_upper}' -> SEUMUR HIDUP")
break
# Post-processing
result = self._post_process(result)
return result
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
"""Ekstrak nilai setelah label (supports various separators)"""
patterns = [
rf'(?:{label_pattern})\s*:\s*(.+)', # label: value
rf'(?:{label_pattern})\s+([A-Z0-9].+)', # label VALUE (uppercase start)
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
value = match.group(1).strip()
# Remove trailing colon or label fragment
value = re.sub(r'^[:\s]+', '', value)
value = re.sub(r'\s*:\s*$', '', value)
if value and len(value) > 1:
return value
return None
def _parse_ttl(self, ttl_text: str, result: Dict):
"""Parse tempat/tanggal lahir dari text"""
ttl_text = ttl_text.strip()
# Normalize dates where OCR missed dashes:
# "05 08 1978" -> "05-08-1978"
# "05 08-1978" -> "05-08-1978"
# "05-08 1978" -> "05-08-1978"
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
# Handle "0508-1978" -> "05-08-1978" (Missing separator between day/month)
ttl_text = re.sub(r'(\d{2})(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
# Handle 8-digit date without separator: "05081978" -> "05-08-1978"
date_8digit = re.search(r'(\d{8})', ttl_text)
if date_8digit:
d = date_8digit.group(1)
formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}"
ttl_text = ttl_text.replace(d, formatted)
# Handle merged city+date like "JAKARTA05-08-1978" - add space before digits
ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)
# Handle merged city+date like "JAKARTA.05-08-1978" -> replace dot with space
ttl_text = re.sub(r'([A-Z])\.(\d)', r'\1 \2', ttl_text, flags=re.IGNORECASE)
# Format: "TEMPAT, DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY"
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
if date_match:
result['tanggal_lahir'] = date_match.group(1)
# Tempat adalah bagian sebelum tanggal
place = ttl_text[:date_match.start()].strip(' ,:-/.')
# Clean up label remnants
place = re.sub(r'^(tempat|tgl|lahir||:)[/\s:]*', '', place, flags=re.IGNORECASE).strip()
if place and len(place) > 2:
result['tempat_lahir'] = place.upper()
else:
# Coba split by comma
parts = ttl_text.split(',')
if len(parts) >= 2:
result['tempat_lahir'] = parts[0].strip().upper()
result['tanggal_lahir'] = parts[1].strip()
elif len(parts) == 1 and len(ttl_text) > 2:
result['tempat_lahir'] = ttl_text.upper()
    def _post_process(self, result: Dict) -> Dict:
        """Post-process the raw extraction result in place and return it.

        Runs, in order:
          1. NIK validation (must be exactly 16 digits).
          2. Birth-date format repair (DDMM-YYYY / DDMMYYYY -> DD-MM-YYYY).
          3. Generic cleanup of every string field (colons, whitespace).
          4. Removal of label fragments captured inside values.
          5. Normalisation of marital status and the validity field
             (which defaults to 'SEUMUR HIDUP' per e-KTP regulation).
          6. Balinese name parsing when the card looks Balinese.
          7. Fuzzy validation of the religion field.
          8. De-merging of glued region/province/address words.
          9. Cross-validation of birth place against kel/desa.
        """
        # --- 1. NIK validation: keep only a clean 16-digit string ---
        if result['nik'] and not re.match(r'^\d{16}$', result['nik']):
            cleaned = re.sub(r'\D', '', result['nik'])
            if len(cleaned) == 16:
                result['nik'] = cleaned
            else:
                # Not recoverable as a 16-digit NIK -> drop it.
                result['nik'] = None
        # --- 2. Repair malformed birth-date formats ---
        # Pattern: DDMM-YYYY (e.g. 1608-1976) -> DD-MM-YYYY (16-08-1976)
        if result['tanggal_lahir']:
            tl = result['tanggal_lahir']
            # DDMM-YYYY (first separator missing)
            wrong_format = re.match(r'^(\d{2})(\d{2})-(\d{4})$', tl)
            if wrong_format:
                result['tanggal_lahir'] = f"{wrong_format.group(1)}-{wrong_format.group(2)}-{wrong_format.group(3)}"
                print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")
            # DDMMYYYY (no separator at all)
            no_sep_format = re.match(r'^(\d{2})(\d{2})(\d{4})$', tl)
            if no_sep_format:
                result['tanggal_lahir'] = f"{no_sep_format.group(1)}-{no_sep_format.group(2)}-{no_sep_format.group(3)}"
                print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")
        # --- 3. Clean all string values: strip stray colons and whitespace ---
        for field in result:
            if result[field] and isinstance(result[field], str):
                val = result[field]
                # Remove leading colons (standard and full-width)
                val = re.sub(r'^[\s:]+', '', val)
                # Remove trailing colons
                val = re.sub(r'[\s:]+$', '', val)
                # Collapse repeated whitespace into single spaces
                val = re.sub(r'\s+', ' ', val)
                result[field] = val.strip()
        # --- 4. Strip field labels that were captured with the value ---
        for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
            if result[field]:
                # Remove common labels that leaked into the value
                result[field] = re.sub(
                    r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s:]*',
                    '', result[field], flags=re.IGNORECASE
                ).strip()
        # --- 5a. Marital status still containing its own label ---
        if result['status_perkawinan']:
            sp = result['status_perkawinan']
            sp = re.sub(r'^(STATUS|PERKAWINAN)[\s:]*', '', sp, flags=re.IGNORECASE).strip()
            result['status_perkawinan'] = sp
        # --- 5b. Validity ("berlaku hingga") normalisation ---
        if result['berlaku_hingga']:
            bh = result['berlaku_hingga']
            bh = re.sub(r'^(BERLAKU|HINGGA)[\s:]*', '', bh, flags=re.IGNORECASE).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh
        else:
            # Fallback: by government regulation, e-KTP cards issued since
            # 2011 are valid for life ("SEUMUR HIDUP").
            result['berlaku_hingga'] = 'SEUMUR HIDUP'
            print(" [FALLBACK] berlaku_hingga = SEUMUR HIDUP (peraturan pemerintah)")
        # ============================================
        # 6. Parse Balinese names when detected
        # ============================================
        # The card is considered Balinese when any of the following hold:
        #   1. Province == BALI
        #   2. NIK starts with 51 (Bali province code)
        #   3. Name starts with a typical Balinese component (NI, I GUSTI, ...)
        is_bali = False
        if result.get('provinsi') and 'BALI' in result['provinsi'].upper():
            is_bali = True
        elif result.get('nik') and result['nik'].startswith('51'):
            is_bali = True
        elif result.get('nama'):
            nama_upper = result['nama'].upper()
            # Check whether the name starts with a Balinese prefix
            if nama_upper.startswith('NI') or nama_upper.startswith('IGUSTI') or \
               nama_upper.startswith('IDABAGUS') or nama_upper.startswith('IDAAYU') or \
               any(nama_upper.startswith(p) for p in ['GUSTI', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG']):
                is_bali = True
        if is_bali and result.get('nama'):
            result['nama'] = self._parse_balinese_name(result['nama'])
        # ============================================
        # 7. Validate and correct the religion field
        # ============================================
        if result.get('agama'):
            agama = result['agama'].upper().strip()
            # Fuzzy match against the official list of valid religions;
            # keep the best candidate scoring above 0.6 similarity.
            agama_match = None
            best_ratio = 0
            for valid_agama in self.AGAMA_LIST:
                ratio = difflib.SequenceMatcher(None, agama, valid_agama.upper()).ratio()
                if ratio > best_ratio and ratio > 0.6:
                    best_ratio = ratio
                    agama_match = valid_agama.upper()
            if agama_match:
                if agama_match != agama:
                    print(f" [AGAMA VALIDATE] '{agama}' -> '{agama_match}' (ratio={best_ratio:.2f})")
                result['agama'] = agama_match
        # No automatic fallback for religion - it must come from the OCR
        # --- 8a. Fix merged kabupaten/kota names (JAKARTASELATAN -> JAKARTA SELATAN) ---
        if result['kabupaten_kota']:
            kk = result['kabupaten_kota']
            # Add a space before directional words
            kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                        r'\1 \2', kk, flags=re.IGNORECASE)
            # Common merged patterns
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()
        # --- 8b. Fix merged province names ---
        if result['provinsi']:
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                          r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()
        # --- 8c. Fix merged address tokens (e.g. JLKECAPIV -> JL KECAPI V) ---
        if result['alamat']:
            alamat = result['alamat']
            # Add a space after common street prefixes
            alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add a space before Roman numerals at the end (I, II, ..., X)
            alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add a space before trailing digits
            alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
            alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
            result['alamat'] = alamat.upper()
        # ============================================
        # 9. Cross-validation: birth place vs kel/desa
        # ============================================
        # On a KTP the birth place frequently equals the village/kelurahan;
        # when the two are similar, prefer the validated kel/desa spelling.
        if result.get('tempat_lahir') and result.get('kel_desa'):
            tl = result['tempat_lahir'].upper()
            kd = result['kel_desa'].upper()
            # Similarity score between the two strings
            ratio = difflib.SequenceMatcher(None, tl, kd).ratio()
            if ratio > 0.7:
                # Close enough: adopt the validated kel/desa value
                print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mirip dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")
                result['tempat_lahir'] = kd
            elif ratio > 0.5:
                # Somewhat similar: only log for debugging
                print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mungkin sama dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")
        # If tempat_lahir is empty but kel_desa is set they might match,
        # but we do not auto-fill because they can legitimately differ.
        return result
if __name__ == "__main__":
    # Smoke test: run a cleanly formatted synthetic OCR result through
    # the extractor and dump every parsed field.
    sample_lines = [
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    ]
    sample_ocr = [{'text': line} for line in sample_lines]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    for key, value in result.items():
        print(f"{key}: {value}")