1509 lines
71 KiB
Python
1509 lines
71 KiB
Python
"""
|
||
KTP Field Extractor
|
||
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
|
||
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
|
||
|
||
OPTIMIZED: Pre-compiled regex patterns for better performance
|
||
"""
|
||
|
||
import re
|
||
from typing import Dict, Optional, List
|
||
import difflib
|
||
|
||
# Debug mode - set to False for production.
# Gates the verbose matching diagnostics printed throughout this module.
DEBUG_MODE = False
|
||
|
||
class KTPExtractor:
    """Extract structured fields from Indonesian KTP (ID card) OCR output."""

    # Pre-compiled regex patterns, hoisted to class level so they are
    # compiled once instead of per call (optimization).
    COLON_PATTERN = re.compile(r'[::]')                              # ASCII or full-width colon
    NIK_PATTERN = re.compile(r'\b(\d{16})\b')                        # 16-digit national ID number
    DATE_PATTERN = re.compile(r'(\d{2}[-/]\d{2}[-/]\d{4})')          # DD-MM-YYYY or DD/MM/YYYY
    RT_RW_PATTERN = re.compile(r'(\d{3})\s*/\s*(\d{3})')             # e.g. "003/007"
    GOL_DARAH_PATTERN = re.compile(r'([ABO]{1,2}[+\-]?)', re.IGNORECASE)  # blood group
    PROVINSI_SPLIT_PATTERN = re.compile(r'(?i)provinsi\s*')
    KABUPATEN_SPLIT_PATTERN = re.compile(r'(?i)\s*(kabupaten|kota)\s*')
    TTL_PATTERN = re.compile(r'(?i)tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')

    # Colon pattern as a plain string (kept for backward compatibility).
    COLON_PATTERN_STR = r'[::]'

    # All 38 Indonesian provinces, upper-cased for direct substring matching.
    PROVINSI_LIST = [
        "ACEH", "SUMATERA UTARA", "SUMATERA BARAT", "RIAU", "JAMBI", "SUMATERA SELATAN", "BENGKULU", "LAMPUNG",
        "KEPULAUAN BANGKA BELITUNG", "KEPULAUAN RIAU", "DKI JAKARTA", "JAWA BARAT", "JAWA TENGAH", "DI YOGYAKARTA",
        "JAWA TIMUR", "BANTEN", "BALI", "NUSA TENGGARA BARAT", "NUSA TENGGARA TIMUR", "KALIMANTAN BARAT",
        "KALIMANTAN TENGAH", "KALIMANTAN SELATAN", "KALIMANTAN TIMUR", "KALIMANTAN UTARA", "SULAWESI UTARA",
        "SULAWESI TENGAH", "SULAWESI SELATAN", "SULAWESI TENGGARA", "GORONTALO", "SULAWESI BARAT", "MALUKU",
        "MALUKU UTARA", "PAPUA BARAT", "PAPUA", "PAPUA SELATAN", "PAPUA TENGAH", "PAPUA PEGUNUNGAN", "PAPUA BARAT DAYA"
    ]

    # Keywords identifying gender ("jenis kelamin") values.
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Religions considered valid on a KTP.
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupations used to recognize the "pekerjaan" field value.
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # Official marital-status values.
    STATUS_PERKAWINAN_LIST = ['BELUM KAWIN', 'KAWIN', 'CERAI HIDUP', 'CERAI MATI']

    # Field labels for fuzzy matching (handles OCR typos like "Aamat" -> "ALAMAT").
    FIELD_LABELS = {
        'nama': ['NAMA'],
        'alamat': ['ALAMAT'],
        'agama': ['AGAMA'],
        'pekerjaan': ['PEKERJAAN'],
        'kewarganegaraan': ['KEWARGANEGARAAN', 'WARGANEGARA'],
        'tempat_lahir': ['TEMPAT', 'LAHIR', 'TEMPAT/TGL LAHIR'],
        'jenis_kelamin': ['JENIS KELAMIN', 'JENIS', 'KELAMIN'],
        'gol_darah': ['GOL. DARAH', 'GOL DARAH', 'GOLONGAN DARAH'],
        'kel_desa': ['KEL/DESA', 'KELURAHAN', 'DESA'],
        'kecamatan': ['KECAMATAN', 'KEC'],
        'status_perkawinan': ['STATUS PERKAWINAN', 'PERKAWINAN'],
        'berlaku_hingga': ['BERLAKU HINGGA', 'BERLAKU'],
        'rt_rw': ['RT/RW', 'RT', 'RW'],
    }

    # ============================================
    # Balinese Hindu naming system
    # ============================================
    # Structure: [Gender Prefix] + [Caste Title] + [Gender Marker] + [Birth Order] + [Personal Name]

    # Gender prefixes (must appear at the start of the name).
    BALI_GENDER_PREFIX = {
        'NI': 'PEREMPUAN',  # Female prefix
        'I': 'LAKI-LAKI',   # Male prefix
    }

    # Caste titles (follow the gender prefix).
    BALI_KASTA = {
        'IDA': 'BRAHMANA',
        'GUSTI': 'KSATRIA',
        'ANAK AGUNG': 'KSATRIA',
        'COKORDA': 'KSATRIA',
        'DEWA': 'KSATRIA',
        'DESAK': 'KSATRIA',
        'AGUNG': 'KSATRIA',
        'NGAKAN': 'WAISYA',
        'SANG': 'WAISYA',
        'SI': 'WAISYA',
    }

    # Additional gender markers (follow the caste title).
    BALI_GENDER_MARKER = {
        'AYU': 'PEREMPUAN',
        'ISTRI': 'PEREMPUAN',
        'LUH': 'PEREMPUAN',
        'BAGUS': 'LAKI-LAKI',
        'GEDE': 'LAKI-LAKI',
        'AGUS': 'LAKI-LAKI',
        'ALIT': 'LAKI-LAKI',  # Small/young (for males)
    }

    # Birth-order names (the cycle restarts every 4 children).
    BALI_BIRTH_ORDER = {
        'PUTU': 1, 'WAYAN': 1, 'GEDE': 1, 'ILUH': 1,
        'MADE': 2, 'KADEK': 2, 'NENGAH': 2,
        'NYOMAN': 3, 'KOMANG': 3,
        'KETUT': 4,
        'BALIK': 5,  # For the 5th+ child (cycle repeats)
    }

    # Balinese clans ("soroh", identifies the family lineage).
    BALI_SOROH = {
        'PASEK': 'SOROH',     # Majority clan (~60% of Balinese Hindus)
        'PANDE': 'SOROH',     # Blacksmith/metallurgy clan
        'ARYA': 'SOROH',      # Arya clan
        'BENDESA': 'SOROH',   # Customary leaders
        'TANGKAS': 'SOROH',   # Tangkas clan
        'CELAGI': 'SOROH',    # Celagi clan
        'SENGGUHU': 'SOROH',  # Sengguhu clan
        'KUBAYAN': 'SOROH',   # Kubayan clan
        'BANDESA': 'SOROH',   # Variant of Bendesa
    }

    # All name components combined for detection (listed roughly longest first;
    # _parse_balinese_name re-sorts by length before greedy matching).
    BALI_NAME_COMPONENTS = [
        # Gender prefixes
        'NI', 'I',
        # Caste titles (longer ones first for priority)
        'ANAK AGUNG', 'COKORDA', 'NGAKAN',
        'IDA', 'GUSTI', 'DEWA', 'DESAK', 'AGUNG', 'SANG', 'SI',
        # Clans
        'PASEK', 'PANDE', 'ARYA', 'BENDESA', 'BANDESA', 'TANGKAS', 'CELAGI', 'SENGGUHU', 'KUBAYAN',
        # Gender markers
        'AYU', 'ISTRI', 'LUH', 'BAGUS', 'GEDE', 'AGUS', 'ALIT',
        # Birth order
        'WAYAN', 'PUTU', 'ILUH', 'MADE', 'KADEK', 'NENGAH', 'NYOMAN', 'KOMANG', 'KETUT', 'BALIK',
    ]

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max),
    # based on the standard KTP layout. Zones deliberately overlap a little;
    # _get_zone returns the first match in this dict's insertion order.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Name area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Place/date of birth
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Gender (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Blood group (right of gender)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Address
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Village
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # District
        'agama': (0.02, 0.63, 0.70, 0.72),             # Religion
        'status': (0.02, 0.69, 0.70, 0.78),            # Marital status
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Occupation
        'wni': (0.02, 0.81, 0.70, 0.90),               # Citizenship
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Valid-until
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Issue place & date
    }
|
||
|
||
def __init__(self):
|
||
self.image_width = 0
|
||
self.image_height = 0
|
||
|
||
def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
|
||
"""Determine which zone a text belongs to based on normalized coordinates"""
|
||
if img_width == 0 or img_height == 0:
|
||
return None
|
||
|
||
# Normalize coordinates
|
||
x_norm = x_center / img_width
|
||
y_norm = y_center / img_height
|
||
|
||
for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
|
||
if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
|
||
return zone_name
|
||
return None
|
||
|
||
def _extract_value_from_text(self, text: str) -> str:
|
||
"""Extract value part from label:value text"""
|
||
# Split by colon (standard or full-width)
|
||
parts = re.split(r'[::]', text, 1)
|
||
if len(parts) > 1:
|
||
return parts[1].strip()
|
||
return text.strip()
|
||
|
||
def _find_best_match(self, text: str, candidates: List[str], cutoff: float = 0.6) -> Optional[str]:
|
||
"""Find best fuzzy match from candidates"""
|
||
matches = difflib.get_close_matches(text, candidates, n=1, cutoff=cutoff)
|
||
return matches[0] if matches else None
|
||
|
||
def _is_label_match(self, text: str, field_name: str, cutoff: float = 0.7) -> bool:
|
||
"""
|
||
Fuzzy match untuk label field - mengatasi typo OCR seperti "Aamat" -> "ALAMAT"
|
||
Returns True jika text cocok dengan salah satu label untuk field tersebut
|
||
"""
|
||
if not text or not text.strip():
|
||
return False
|
||
|
||
if field_name not in self.FIELD_LABELS:
|
||
return field_name.lower() in text.lower()
|
||
|
||
text_upper = text.upper().strip()
|
||
|
||
# Explicit conflict prevention
|
||
if field_name == 'agama' and 'ALAMAT' in text_upper:
|
||
return False
|
||
if field_name == 'alamat' and 'AGAMA' in text_upper:
|
||
return False
|
||
|
||
# Coba exact match dulu (lebih cepat)
|
||
for label in self.FIELD_LABELS[field_name]:
|
||
if label in text_upper:
|
||
return True
|
||
|
||
# Fuzzy match jika tidak ada exact match
|
||
# Ekstrak kata pertama dari text (biasanya label ada di awal)
|
||
parts = text_upper.split(':')[0].split()
|
||
if not parts:
|
||
return False
|
||
first_word = parts[0]
|
||
|
||
for label in self.FIELD_LABELS[field_name]:
|
||
label_parts = label.split()
|
||
if not label_parts:
|
||
continue
|
||
# Bandingkan dengan kata pertama
|
||
ratio = difflib.SequenceMatcher(None, first_word, label_parts[0]).ratio()
|
||
|
||
# Dynamic cutoff logic
|
||
effective_cutoff = cutoff
|
||
if len(first_word) < 7:
|
||
# Use stricter cutoff for short words to prevent ALAMAT (6) matching AGAMA (5) -> ratio 0.73
|
||
effective_cutoff = max(cutoff, 0.82)
|
||
|
||
if ratio >= effective_cutoff:
|
||
if DEBUG_MODE:
|
||
print(f" [FUZZY LABEL] '{first_word}' matched '{label}' (ratio={ratio:.2f})")
|
||
return True
|
||
|
||
return False
|
||
|
||
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
|
||
"""
|
||
Extract content after a label (fuzzy/regex match).
|
||
Handles cases with/without colons.
|
||
"""
|
||
if not text: return None
|
||
|
||
# 1. Try Regex Search if pattern provided
|
||
if label_pattern:
|
||
# Construct regex: Label + optional spaces/colon + (Group 1: Value)
|
||
# flags=re.IGNORECASE should be used
|
||
# We want to find the END of the label
|
||
match = re.search(f"({label_pattern})[:\\s]*", text, re.IGNORECASE)
|
||
if match:
|
||
# Return everything after the match end
|
||
return text[match.end():].strip()
|
||
|
||
return None
|
||
|
||
    def _parse_balinese_name(self, name: str) -> str:
        """
        Re-insert spaces into a Balinese name that OCR merged into one token.

        Example: "NIGUSTIAYUNYOMANSUWETRI" -> "NI GUSTI AYU NYOMAN SUWETRI"

        Balinese name structure:
        [Gender Prefix] + [Caste Title] + [Gender Marker] + [Birth Order] + [Personal Name]

        IMPORTANT: only runs when the name genuinely contains Balinese
        components; anything else is returned upper-cased but otherwise
        unchanged.
        """
        if not name:
            return name

        name_upper = name.upper().strip()

        # Already has a reasonable number of spaces -> assume OCR kept the
        # word boundaries and return as-is.
        if name_upper.count(' ') >= 2:
            return name_upper

        # Check whether the name contains Balinese components.  It must
        # start with NI, I GUSTI, IDA, or a Balinese birth-order word.
        name_clean = name_upper.replace(' ', '')

        is_balinese_name = False
        # Check typical Balinese prefixes.
        if name_clean.startswith('NI') and len(name_clean) > 3:
            # NI must be followed by another Balinese component (GUSTI, LUH, WAYAN, ...)
            after_ni = name_clean[2:]
            for comp in ['GUSTI', 'LUH', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG', 'PASEK', 'PANDE']:
                if after_ni.startswith(comp):
                    is_balinese_name = True
                    break
        elif name_clean.startswith('IGUSTI') or name_clean.startswith('IDABAGUS') or name_clean.startswith('IDAAYU'):
            is_balinese_name = True
        elif any(name_clean.startswith(p) for p in ['GUSTI', 'WAYAN', 'PUTU', 'MADE', 'KADEK', 'NYOMAN', 'KOMANG', 'KETUT']):
            is_balinese_name = True

        if not is_balinese_name:
            # Not a Balinese name: return with standard spacing.
            # One space present -> return unchanged.
            if ' ' in name_upper:
                return name_upper
            # No space at all -> return unchanged (may genuinely be one word).
            return name_upper

        # Components ordered longest-first so greedy matching prefers the
        # most specific component (e.g. "ANAK AGUNG" before "AGUNG").
        components_ordered = sorted(self.BALI_NAME_COMPONENTS, key=len, reverse=True)

        result_parts = []
        remaining = name_clean

        # Parse the gender prefix (NI or I at the start).
        if remaining.startswith('NI'):
            result_parts.append('NI')
            remaining = remaining[2:]
        elif remaining.startswith('I') and len(remaining) > 1:
            # Make sure the I is a real prefix, not the first letter of
            # another word.
            next_char = remaining[1] if len(remaining) > 1 else ''
            # Treat as a prefix only when followed by a consonant.
            if next_char not in 'AIUEO':
                result_parts.append('I')
                remaining = remaining[1:]

        # Greedily peel known components off the front of the string.
        found = True
        max_iterations = 10  # Prevent infinite loop
        iteration = 0

        while remaining and found and iteration < max_iterations:
            found = False
            iteration += 1

            for component in components_ordered:
                if remaining.startswith(component):
                    # Skip components already collected (personal names excluded);
                    # the component is consumed either way.
                    if component not in result_parts or component not in self.BALI_NAME_COMPONENTS:
                        result_parts.append(component)
                    remaining = remaining[len(component):]
                    found = True
                    break

        # Whatever is left is the personal name.
        if remaining:
            result_parts.append(remaining)

        parsed_name = ' '.join(result_parts)

        # Log when the name actually changed.
        # NOTE(review): unconditional print -- presumably this should be
        # behind DEBUG_MODE like other diagnostics; confirm before silencing.
        if parsed_name != name_upper:
            print(f" [BALI NAME] '{name_upper}' -> '{parsed_name}'")

        return parsed_name
|
||
|
||
def _search_best_match_in_text(self, text: str, candidates: List[str], prefix: str = "") -> tuple:
|
||
"""
|
||
Search if any candidate is present in text using multiple strategies:
|
||
1. Exact substring
|
||
2. Prefix + Candidate (Fuzzy) - e.g. "PROVINSI BALI"
|
||
3. Candidate Only (Fuzzy) - e.g. "BALI" (if prefix is missing/damaged)
|
||
Returns (best_candidate, confidence_score)
|
||
"""
|
||
text_upper = text.upper()
|
||
best_match = None
|
||
best_ratio = 0.0
|
||
|
||
# Strategy 1: Exact substring match (fastest & most reliable)
|
||
for candidate in candidates:
|
||
if candidate in text_upper:
|
||
if len(candidate) > len(best_match or ""):
|
||
best_match = candidate
|
||
best_ratio = 1.0
|
||
|
||
if best_ratio == 1.0:
|
||
return best_match, best_ratio
|
||
|
||
# Strategy 2: Prefix Construction & Fuzzy Match
|
||
prefix_upper = prefix.upper() if prefix else ""
|
||
|
||
# DEBUG: Print checking (controlled by DEBUG_MODE)
|
||
if DEBUG_MODE:
|
||
print(f"DEBUG Check Text: '{text_upper}' with Prefix: '{prefix_upper}'")
|
||
|
||
for candidate in candidates:
|
||
# 2a. Compare with Prefix + Space (e.g. "PROVINSI BALI")
|
||
if prefix:
|
||
target_spaced = f"{prefix_upper} {candidate}"
|
||
s_spaced = difflib.SequenceMatcher(None, target_spaced, text_upper)
|
||
ratio_spaced = s_spaced.ratio()
|
||
|
||
# print(f" -> Compare '{target_spaced}' vs '{text_upper}' = {ratio_spaced:.2f}")
|
||
|
||
if ratio_spaced > best_ratio and ratio_spaced > 0.5:
|
||
best_ratio = ratio_spaced
|
||
best_match = candidate
|
||
|
||
# 2b. Compare with Prefix NO SPACE (e.g. "PROVINSIBALI")
|
||
# This handles "PROVNSIBALI" perfectly
|
||
target_merged = f"{prefix_upper}{candidate}"
|
||
s_merged = difflib.SequenceMatcher(None, target_merged, text_upper)
|
||
ratio_merged = s_merged.ratio()
|
||
|
||
if DEBUG_MODE:
|
||
print(f" -> Compare Merged '{target_merged}' vs '{text_upper}' = {ratio_merged:.2f}")
|
||
|
||
if ratio_merged > best_ratio and ratio_merged > 0.5:
|
||
best_ratio = ratio_merged
|
||
best_match = candidate
|
||
|
||
# 2c. Compare Candidate ONLY (e.g. "BALI")
|
||
if len(candidate) > 3:
|
||
s_raw = difflib.SequenceMatcher(None, candidate, text_upper)
|
||
ratio_raw = s_raw.ratio()
|
||
|
||
# print(f" -> Compare Raw '{candidate}' vs '{text_upper}' = {ratio_raw:.2f}")
|
||
|
||
if ratio_raw > best_ratio and ratio_raw > 0.6:
|
||
best_ratio = ratio_raw
|
||
best_match = candidate
|
||
|
||
if DEBUG_MODE:
|
||
print(f"DEBUG Best Match: {best_match} ({best_ratio:.2f})")
|
||
return best_match, best_ratio
|
||
|
||
def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
|
||
"""Detect image dimensions from bounding boxes"""
|
||
max_x, max_y = 0, 0
|
||
for r in ocr_results:
|
||
bbox = r.get('bbox', [])
|
||
if bbox and len(bbox) >= 4:
|
||
for point in bbox:
|
||
if len(point) >= 2:
|
||
max_x = max(max_x, point[0])
|
||
max_y = max(max_y, point[1])
|
||
# Add some margin
|
||
return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)
|
||
|
||
def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
|
||
"""Extract fields based on zone assignments"""
|
||
|
||
# PROVINSI from header
|
||
if 'header_provinsi' in zone_texts:
|
||
print(f"DEBUG Zone Provinsi Content: {zone_texts['header_provinsi']}")
|
||
for text in zone_texts['header_provinsi']:
|
||
text_clean = text.strip()
|
||
# Use prefix strategy: "PROVINSI " + result vs text
|
||
match, score = self._search_best_match_in_text(text_clean, self.PROVINSI_LIST, prefix="PROVINSI")
|
||
|
||
# LOWER THRESHOLD to 0.5 because "PROVINSI BALI" vs "PROVNSIBALI" is roughly 0.5-0.6 range
|
||
if match and score > 0.5:
|
||
result['provinsi'] = match
|
||
|
||
# Remove the found province (and label) from text to see what's left
|
||
# If we matched "PROVINSI JAWA TIMUR", the text might be "PROVNSIJAWATMRKABUPATENSUMENEP"
|
||
# It's hard to cleanly remove "PROVISI JAWA TIMUR" if it was fuzzy matched.
|
||
|
||
# BUT, we can try to find "KABUPATEN" or "KOTA" in the original text
|
||
# independent of the province match
|
||
if 'kabupaten' in text_clean.lower() or 'kota' in text_clean.lower():
|
||
parts = re.split(r'(?i)\s*(kabupaten|kota)', text_clean)
|
||
if len(parts) > 1:
|
||
kab_part = "".join(parts[1:]).strip()
|
||
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
|
||
if kab_val and result['kabupaten_kota'] is None:
|
||
prefix = "KABUPATEN" if "kabupaten" in text_clean.lower() else "KOTA"
|
||
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
|
||
break
|
||
|
||
# Fallback to keyword splitting (Legacy/Blurry fallback)
|
||
text_lower = text.lower()
|
||
val = text
|
||
|
||
# If keyword exists, strip it
|
||
if 'provinsi' in text_lower:
|
||
split_prov = re.split(r'(?i)provinsi\s*', text, 1)
|
||
if len(split_prov) > 1:
|
||
val = split_prov[1].strip()
|
||
else:
|
||
val = ""
|
||
|
||
# Check for merged text
|
||
if 'kabupaten' in text_lower or 'kota' in text_lower:
|
||
parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
|
||
val = parts[0].strip()
|
||
|
||
if len(parts) > 1:
|
||
kab_part = "".join(parts[1:]).strip()
|
||
kab_val = re.sub(r'^(?i)(kabupaten|kota)\s*', '', kab_part).strip()
|
||
if kab_val and result['kabupaten_kota'] is None:
|
||
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
|
||
result['kabupaten_kota'] = f"{prefix} {kab_val.upper()}"
|
||
|
||
if val and len(val) > 2:
|
||
# Try fuzzy match again on the cleaned value
|
||
best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
|
||
if best_match:
|
||
result['provinsi'] = best_match
|
||
else:
|
||
result['provinsi'] = val.upper()
|
||
break
|
||
|
||
# KABUPATEN/KOTA from header
|
||
if 'header_kabupaten' in zone_texts:
|
||
for text in zone_texts['header_kabupaten']:
|
||
text_lower = text.lower()
|
||
val = text
|
||
|
||
# Check keyword
|
||
if 'kabupaten' in text_lower or 'kota' in text_lower:
|
||
split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
|
||
if len(split_kab) > 1:
|
||
val = split_kab[-1].strip()
|
||
else:
|
||
val = ""
|
||
|
||
# If no keyword, but it's in the kabupaten zone, assume it's data
|
||
if val:
|
||
# Re-add prefix standard if we separated it or if it was missing
|
||
# Heuristic: if validation suggests it's a known regency, we are good.
|
||
# For now, standardize format.
|
||
if result['kabupaten_kota'] is None:
|
||
prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
|
||
# If no keyword found, default to KABUPATEN? Or better check Wilayah?
|
||
# Let's default to detected keyword or KABUPATEN
|
||
if "kota" in text_lower:
|
||
prefix = "KOTA"
|
||
else:
|
||
prefix = "KABUPATEN"
|
||
|
||
result['kabupaten_kota'] = f"{prefix} {val.upper()}"
|
||
break
|
||
|
||
# NAMA from nama zone (skip label line)
|
||
if 'nama' in zone_texts:
|
||
for text in zone_texts['nama']:
|
||
text_lower = text.lower()
|
||
if 'nama' not in text_lower and len(text) > 2:
|
||
result['nama'] = text.upper()
|
||
break
|
||
elif 'nama' in text_lower:
|
||
val = self._extract_value_from_text(text)
|
||
if val and 'nama' not in val.lower():
|
||
result['nama'] = val.upper()
|
||
|
||
# TTL from ttl zone
|
||
if 'ttl' in zone_texts:
|
||
for text in zone_texts['ttl']:
|
||
# Skip if text is JUST the label (length check or fuzzy match)
|
||
if len(text) < 15 and self._is_label_match(text, 'tempat_lahir'):
|
||
continue
|
||
|
||
if 'tempat' in text.lower() or 'lahir' in text.lower() or 'tgl' in text.lower() or len(text) > 5:
|
||
val = self._extract_value_from_text(text)
|
||
if val:
|
||
# Don't accept if val looks like label
|
||
if self._is_label_match(val, 'tempat_lahir') and len(val) < 20:
|
||
continue
|
||
|
||
self._parse_ttl(val, result)
|
||
# Only break if we actually got a birth date, otherwise keep looking
|
||
if result['tanggal_lahir']:
|
||
break
|
||
|
||
# JENIS KELAMIN
|
||
if 'jenis_kelamin' in zone_texts:
|
||
for text in zone_texts['jenis_kelamin']:
|
||
text_lower = text.lower()
|
||
if 'laki' in text_lower:
|
||
result['jenis_kelamin'] = 'LAKI-LAKI'
|
||
break
|
||
elif 'perempuan' in text_lower or 'wanita' in text_lower:
|
||
result['jenis_kelamin'] = 'PEREMPUAN'
|
||
break
|
||
|
||
# GOL DARAH
|
||
if 'gol_darah' in zone_texts:
|
||
for text in zone_texts['gol_darah']:
|
||
gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
|
||
if gol_match:
|
||
result['gol_darah'] = gol_match.group(1).upper()
|
||
break
|
||
|
||
# ALAMAT
|
||
if 'alamat' in zone_texts:
|
||
for text in zone_texts['alamat']:
|
||
if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1:
|
||
val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
|
||
if val and 'alamat' not in val.lower():
|
||
result['alamat'] = val.upper()
|
||
break
|
||
|
||
# RT/RW
|
||
if 'rt_rw' in zone_texts:
|
||
for text in zone_texts['rt_rw']:
|
||
rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
|
||
if rt_rw_match:
|
||
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
|
||
break
|
||
|
||
# KEL/DESA
|
||
if 'kel_desa' in zone_texts:
|
||
for text in zone_texts['kel_desa']:
|
||
if 'kel' in text.lower() or 'desa' in text.lower():
|
||
val = self._extract_value_from_text(text)
|
||
if val and 'kel' not in val.lower():
|
||
result['kel_desa'] = val.upper()
|
||
break
|
||
elif result['kel_desa'] is None:
|
||
# Fallback context: simple text
|
||
result['kel_desa'] = text.upper()
|
||
|
||
# KECAMATAN
|
||
if 'kecamatan' in zone_texts:
|
||
for text in zone_texts['kecamatan']:
|
||
if 'kec' in text.lower():
|
||
val = self._extract_value_from_text(text)
|
||
if val and 'kec' not in val.lower():
|
||
result['kecamatan'] = val.upper()
|
||
break
|
||
elif result['kecamatan'] is None:
|
||
result['kecamatan'] = text.upper()
|
||
|
||
# AGAMA
|
||
if 'agama' in zone_texts:
|
||
for text in zone_texts['agama']:
|
||
val = text.upper()
|
||
if 'agama' in text.lower():
|
||
val = self._extract_value_from_text(text).upper()
|
||
|
||
# Verify against valid list
|
||
for agama in self.AGAMA_LIST:
|
||
if agama.upper() in val:
|
||
result['agama'] = agama.upper()
|
||
break
|
||
if result['agama']: break
|
||
|
||
# STATUS PERKAWINAN
|
||
if 'status' in zone_texts:
|
||
for text in zone_texts['status']:
|
||
val = text.upper()
|
||
# Normalize common OCR errors (e.g. BELUMKAWIN)
|
||
val = val.replace("BELUMKAWIN", "BELUM KAWIN")
|
||
|
||
# Check against official list
|
||
found_status = False
|
||
for status in self.STATUS_PERKAWINAN_LIST:
|
||
if status in val:
|
||
result['status_perkawinan'] = status
|
||
found_status = True
|
||
break
|
||
if found_status: break
|
||
|
||
# PEKERJAAN
|
||
if 'pekerjaan' in zone_texts:
|
||
best_job = None
|
||
potential_job = None
|
||
|
||
for text in zone_texts['pekerjaan']:
|
||
val = text.upper()
|
||
if 'pekerjaan' in text.lower():
|
||
val = self._extract_value_from_text(text).upper()
|
||
|
||
# Clean up
|
||
val = val.strip()
|
||
if not val or len(val) < 3 or 'PEKERJAAN' in val:
|
||
continue
|
||
|
||
# 1. Check against wildcard/list (Priority)
|
||
# Buruh, Karyawan, Pelajar, dll
|
||
if any(job.upper() in val for job in self.PEKERJAAN_LIST):
|
||
best_job = val
|
||
break # Found a definitive job
|
||
|
||
# 2. Save as potential if it's NOT a known bad value (like City names)
|
||
# Avoid capturing 'TABANAN', 'JAKARTA', date strings
|
||
if not any(city in val for city in ['KABUPATEN', 'KOTA', 'TABANAN', 'BADUNG', 'DENPASAR', 'JAKARTA', 'BANDUNG']):
|
||
if not re.search(r'\d{2}-\d{2}-\d{4}', val): # Avoid dates
|
||
if potential_job is None:
|
||
potential_job = val
|
||
|
||
if best_job:
|
||
result['pekerjaan'] = best_job
|
||
elif potential_job:
|
||
result['pekerjaan'] = potential_job
|
||
|
||
# WNI
|
||
if 'wni' in zone_texts:
|
||
for text in zone_texts['wni']:
|
||
if 'wni' in text.lower():
|
||
result['kewarganegaraan'] = 'WNI'
|
||
break
|
||
elif 'wna' in text.lower():
|
||
result['kewarganegaraan'] = 'WNA'
|
||
break
|
||
|
||
# PENERBITAN area (tempat & tanggal dalam satu zona)
|
||
if 'penerbitan' in zone_texts:
|
||
for text in zone_texts['penerbitan']:
|
||
# Look for date
|
||
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
|
||
if date_match and result['tanggal_penerbitan'] is None:
|
||
result['tanggal_penerbitan'] = date_match.group(1)
|
||
|
||
def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
    """
    Extract KTP fields from OCR output.

    Two passes are combined: (1) template-based zone detection that buckets
    OCR fragments by their x/y position on the card, and (2) a line-by-line
    fallback parser that label-matches each OCR line, followed by aggressive
    whole-text scans for religion, blood type and validity.

    Args:
        ocr_results: list of dicts from OCREngine.extract_text(); each item
            carries a 'text' key and (optionally) 'x_center'/'y_center'
            coordinates used for zone assignment.

    Returns:
        Dict mapping KTP field names to extracted string values, or None
        for fields that could not be recovered.
    """
    # All fields start as None except berlaku_hingga: e-KTP cards issued
    # since 2011 are valid for life by government regulation.
    result = {
        'nik': None,
        'nama': None,
        'tempat_lahir': None,
        'tanggal_lahir': None,
        'jenis_kelamin': None,
        'gol_darah': None,
        'alamat': None,
        'rt_rw': None,
        'kel_desa': None,
        'kecamatan': None,
        'agama': None,
        'status_perkawinan': None,
        'pekerjaan': None,
        'kewarganegaraan': None,
        'berlaku_hingga': 'SEUMUR HIDUP',  # default per e-KTP regulation
        'provinsi': None,
        'kabupaten_kota': None,
        'tanggal_penerbitan': None,
    }

    # Detect image dimensions from the bounding boxes of the OCR results.
    img_width, img_height = self._detect_image_size(ocr_results)

    # Assign each OCR fragment to a named template zone.
    zone_texts = {}  # zone_name -> list of texts
    for r in ocr_results:
        x_center = r.get('x_center', 0)
        y_center = r.get('y_center', 0)
        zone = self._get_zone(x_center, y_center, img_width, img_height)
        if zone:
            if zone not in zone_texts:
                zone_texts[zone] = []
            zone_texts[zone].append(r['text'])

    # NOTE(review): these debug prints run unconditionally even though a
    # module-level DEBUG_MODE flag exists — consider gating them on it.
    print("\n[DEBUG KTPExtractor] Zone assignments:")
    for zone, texts in zone_texts.items():
        print(f" {zone}: {texts}")

    # Extract fields using the zone-based approach first.
    self._extract_by_zones(zone_texts, result)

    # Join all texts for whole-document fallback pattern matching.
    # (Note: `texts` here rebinds the loop variable leaked above.)
    texts = [r['text'].strip() for r in ocr_results]
    all_text = '\n'.join(texts)

    # NIK is a unique 16-digit number and may appear anywhere.
    # NOTE(review): the precompiled NIK_PATTERN class attribute could be
    # used here instead of re-stating the pattern.
    nik_match = re.search(r'\b(\d{16})\b', all_text)
    if nik_match:
        result['nik'] = nik_match.group(1)
        print(f" -> NIK found: {result['nik']}")

    # Fallback: parse line by line for fields the zone pass did not find.
    for i, text in enumerate(texts):
        # Skip lines that are only punctuation or empty.
        text_stripped = text.strip()
        if not text_stripped or text_stripped in [':', ':', '.', '-', '/', '|']:
            continue
        # Skip very short lines made of non-alphanumeric noise.
        if len(text_stripped) <= 2 and not any(c.isalnum() for c in text_stripped):
            continue

        text_lower = text.lower()

        # Normalize full-width colons to ASCII for label/value splitting.
        text_normalized = re.sub(self.COLON_PATTERN, ':', text)
        text_norm_lower = text_normalized.lower()

        # ===== PROVINSI =====
        if 'provinsi' in text_lower and result['provinsi'] is None:
            # Split on the PROVINSI keyword and keep the remainder.
            split_prov = re.split(r'(?i)provinsi\s*', text, 1)
            if len(split_prov) > 1:
                val = split_prov[1].strip()
                # Handle the merged-line case "PROVINSI X KABUPATEN Y".
                if 'kabupaten' in val.lower() or 'kota' in val.lower():
                    parts = re.split(r'(?i)\s*(kabupaten|kota)', val)
                    val = parts[0].strip()

                if val:
                    # Fuzzy-match against the official province list.
                    best_match = self._find_best_match(val.upper(), self.PROVINSI_LIST, cutoff=0.6)
                    if best_match:
                        result['provinsi'] = best_match
                    else:
                        result['provinsi'] = val.upper()

            # If the line only said 'PROVINSI', try the next line.
            if result['provinsi'] is None and i + 1 < len(texts):
                next_text = texts[i+1].strip()
                next_lower = next_text.lower()
                # Only take the next line if it doesn't look like another field.
                if not any(kw in next_lower for kw in ['provinsi', 'kabupaten', 'kota', 'nik']):
                    val = next_text.upper()
                    best_match = self._find_best_match(val, self.PROVINSI_LIST, cutoff=0.6)
                    if best_match:
                        result['provinsi'] = best_match
                    else:
                        result['provinsi'] = val

        # ===== KABUPATEN/KOTA =====
        if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None:
            if 'provinsi' not in text_lower:  # not part of the province header
                # Split on KABUPATEN/KOTA and keep the remainder.
                split_kab = re.split(r'(?i)\s*(kabupaten|kota)\s*', text, 1)
                if len(split_kab) > 1:
                    prefix = "KABUPATEN" if "kabupaten" in text_lower else "KOTA"
                    val = split_kab[-1].strip()
                    if val:
                        result['kabupaten_kota'] = f"{prefix} {val.upper()}"
                    else:
                        result['kabupaten_kota'] = text.strip().upper()
                else:
                    result['kabupaten_kota'] = text.strip().upper()

        # ===== NAMA =====
        if result['nama'] is None and self._is_label_match(text, 'nama'):
            val = self._extract_after_label(text_normalized, 'nama')
            current_name = ""

            if val:
                current_name = val.upper()

            # Merge up to 2 additional lines (names can wrap to 3 lines).
            offset = 1
            while i + offset < len(texts) and offset <= 2:
                next_text = texts[i+offset].strip()
                next_lower = next_text.lower()

                is_stop = False

                # 1. Stop on field labels that appear below the name.
                stop_keywords = ['tempat', 'lahir', 'tgl', 'jenis', 'kelamin', 'alamat', 'rt/rw', 'nik']
                if any(kw in next_lower for kw in stop_keywords):
                    is_stop = True
                    print(f" [NAMA STOP] Matched stop keyword in '{next_text}'")

                # 2. Case heuristic: mostly-lowercase lines are likely labels
                #    (KTP values are printed in uppercase).
                if not is_stop:
                    letters = [c for c in next_text if c.isalpha()]
                    if letters:
                        upper_count = sum(1 for c in letters if c.isupper())
                        upper_ratio = upper_count / len(letters)
                        if upper_ratio < 0.4 and len(letters) > 3:
                            is_stop = True
                            print(f" [NAMA STOP] Likely Label based on Case (Ratio={upper_ratio:.2f})")

                if not is_stop:
                    if len(next_text) > 2:
                        print(f" [NAMA MERGE] Merging '{next_text}'")
                        if current_name:
                            current_name += " " + next_text.upper()
                        else:
                            current_name = next_text.upper()
                        offset += 1
                    else:
                        print(f" [NAMA SKIP] Too short '{next_text}'")
                        # Very short fragments are treated as noise and
                        # skipped rather than terminating the merge.
                        offset += 1
                else:
                    break

            if current_name:
                # Repair common OCR spacing issues in Balinese names
                # (e.g. BAGUSGEDE -> BAGUS GEDE).
                current_name = re.sub(r'(BAGUS)(GEDE)', r'\1 \2', current_name)
                current_name = re.sub(r'(ANAK)(AGUNG)', r'\1 \2', current_name)  # common OCR merge
                result['nama'] = current_name

        # ===== ALAMAT ===== (with fuzzy label matching and multi-line merge)
        if result['alamat'] is None and self._is_label_match(text, 'alamat'):
            # Tolerant label pattern covers OCR variants of "Alamat".
            val = self._extract_after_label(text_normalized, r'a{1,2}l{0,2}a?m{0,2}a?t')

            current_addr = ""
            if val:
                current_addr = val.upper()

            # Keep absorbing following lines (i+1, i+2, ...) until a label
            # or RT/RW pattern signals the end of the address.
            offset = 1
            while i + offset < len(texts):
                next_text = texts[i+offset].strip()
                print(f" [ALAMAT CHECK] Offset +{offset}: '{next_text}'")

                next_lower = next_text.lower()
                is_stop = False

                # 1. RT/RW pattern (ddd/ddd or -/-) always terminates.
                if re.search(r'\d{3}\s*/\s*\d{3}', next_text) or re.match(r'^[.\-]+\s*/\s*[.\-]+$', next_text):
                    is_stop = True
                    print(" [ALAMAT STOP] Matched RT/RW pattern")

                # 2. Known label prefixes below the address field.
                elif any(next_lower.startswith(prefix) for prefix in ['rt/', 'rw', 'rt/rw', 'kel', 'desa', 'kec', 'agama', 'status', 'kawin']):
                    is_stop = True
                    print(" [ALAMAT STOP] Matched label prefix")

                # 3. Distinct full label words anywhere in the line.
                elif any(kw in next_lower for kw in ['kelurahan', 'kecamatan', 'perkawinan', 'kewarganegaraan']):
                    is_stop = True
                    print(" [ALAMAT STOP] Matched distinct label word")

                # 4. Case heuristic: Title Case / lowercase lines are
                #    suspected labels (short words < 5 letters exempt).
                if not is_stop:
                    letters = [c for c in next_text if c.isalpha()]
                    if letters:
                        upper_count = sum(1 for c in letters if c.isupper())
                        upper_ratio = upper_count / len(letters)
                        if upper_ratio < 0.4 and len(letters) > 4:
                            is_stop = True
                            print(f" [ALAMAT STOP] Detected Title Case/Lowercase (Ratio={upper_ratio:.2f}) -> Likely Label")

                # Not a terminator: absorb the line as address continuation.
                if not is_stop:
                    if len(next_text) > 1:
                        print(f" [ALAMAT MERGE] Merging '{next_text}'")
                        if current_addr:
                            current_addr += " " + next_text.upper()
                        else:
                            current_addr = next_text.upper()
                        offset += 1  # continue with the following line
                    else:
                        print(f" [ALAMAT SKIP] Line too short '{next_text}'")
                        offset += 1  # skip noise and keep scanning
                else:
                    print(f" [ALAMAT STOP] Hit Stop Condition '{next_text}'")
                    break

            if current_addr:
                result['alamat'] = current_addr

            # NOTE(review): duplicated assignment kept verbatim; it is
            # harmless but redundant.
            if current_addr:
                result['alamat'] = current_addr

        # ===== RT/RW =====
        # Relaxed pattern also accepts placeholders like -/- or 000/000.
        # NOTE(review): this can also match slash-separated dates
        # (e.g. "01/01/1990") on lines processed before the RT/RW line.
        if result['rt_rw'] is None:
            rt_rw_match = re.search(r'(\d{1,3}|-)\s*/\s*(\d{1,3}|-)', text)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
                print(f" [RT/RW] Found {result['rt_rw']}")

        # ===== KELURAHAN/DESA =====
        # 'kelamin' is excluded because it also contains 'kel'.
        if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
            if result['kel_desa'] is None:
                val = self._extract_after_label(text_normalized, 'kel|desa')
                if val:
                    result['kel_desa'] = val.upper()
                elif i + 1 < len(texts):
                    result['kel_desa'] = texts[i+1].strip().upper()

        # ===== TEMPAT/TANGGAL LAHIR =====
        # Uses _is_label_match for flexibility (e.g. "Tempat/Tgl Lahir",
        # "Tmpt Lahir" and other OCR variants).
        if result['tempat_lahir'] is None and self._is_label_match(text, 'ttl'):
            print(f" [TTL DEBUG] Matched Label on line {i}: '{text}'")
            # Very tolerant label pattern: T..mp..t <junk> L..hir, with
            # common OCR substitutions (a->@, missing letters).
            val = self._extract_after_label(text_normalized, r't[ea]m?p?a?t.*?l[a@]hi?r?|tgl.*?l[a@]hi?r?')

            # If no inline value, try the next line (value-on-next-line layout).
            if not val and i + 1 < len(texts):
                next_text = texts[i+1].strip()
                next_lower = next_text.lower()
                stop_keywords = ['jenis', 'kelamin', 'alamat', 'gol', 'darah']
                if not any(kw in next_lower for kw in stop_keywords):
                    val = next_text.upper()
                    print(f" [TTL DEBUG] Took next line: '{val}'")

            if val:
                print(f" [TTL DEBUG] Parsing value: '{val}'")
                self._parse_ttl(val, result)
                if result['tanggal_lahir']:
                    print(f" [TTL DEBUG] Success: {result['tanggal_lahir']}")

        # ===== JENIS KELAMIN =====
        if result['jenis_kelamin'] is None:
            # 1. Try the label first.
            if self._is_label_match(text, 'jenis_kelamin'):
                val = self._extract_after_label(text_normalized, r'j[ea]ni?s\s*k[ea]l[a@]?mi?n')
                if val:
                    if 'LAKI' in val.upper(): result['jenis_kelamin'] = 'LAKI-LAKI'
                    elif 'PEREMPUAN' in val.upper() or 'WANITA' in val.upper(): result['jenis_kelamin'] = 'PEREMPUAN'

                # Label matched but no inline value: check the next line.
                if result['jenis_kelamin'] is None and i + 1 < len(texts):
                    next_text = texts[i+1].upper()
                    if 'LAKI' in next_text: result['jenis_kelamin'] = 'LAKI-LAKI'
                    elif 'PEREMPUAN' in next_text or 'WANITA' in next_text: result['jenis_kelamin'] = 'PEREMPUAN'

            # 2. Fallback: look for the value keywords directly.
            if result['jenis_kelamin'] is None:
                text_upper = text.upper()
                if 'LAKI-LAKI' in text_upper or 'LAKI - LAKI' in text_upper:
                    result['jenis_kelamin'] = 'LAKI-LAKI'
                elif 'PEREMPUAN' in text_upper:
                    result['jenis_kelamin'] = 'PEREMPUAN'

        # ===== GOLONGAN DARAH =====
        if result['gol_darah'] is None:
            # Check the label first.
            if self._is_label_match(text, 'gol_darah'):
                val = self._extract_after_label(text_normalized, r'g?o?l\.?\s*d?a?r?a?h')
                if val:
                    gd_match = re.search(r'([ABO]{1,2}[+\-]?)', val)
                    if gd_match:
                        result['gol_darah'] = gd_match.group(1).upper()
                else:
                    # Label found without extractable value (e.g. glued
                    # "Gol.Darah : O"): scan the same line, mapping the
                    # OCR digit 0 back to letter O first.
                    gd_match = re.search(r'([ABO]{1,2}[+\-]?)', text.upper().replace('0','O'))
                    if gd_match:
                        result['gol_darah'] = gd_match.group(1).upper()

            # If the line was only the "Gol Darah" label, try the next line.
            if result['gol_darah'] is None and self._is_label_match(text, 'gol_darah') and i+1 < len(texts):
                next_text = texts[i+1].strip().upper()
                if len(next_text) < 5:  # short enough to plausibly be a blood type
                    gd_match = re.search(r'([ABO]{1,2}[+\-]?)', next_text)
                    if gd_match:
                        result['gol_darah'] = gd_match.group(1).upper()

        # ===== KECAMATAN =====
        # 'kelamin' exclusion again avoids the 'kec'/'kel' overlap.
        if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
            if result['kecamatan'] is None:
                val = self._extract_after_label(text_normalized, 'kecamatan|kec')
                if val:
                    result['kecamatan'] = val.upper()
                elif i + 1 < len(texts):
                    # Value on the next line (real KTP layout).
                    next_text = texts[i+1].strip()
                    if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']):
                        result['kecamatan'] = next_text.upper()

        # ===== AGAMA ===== (with fuzzy label matching)
        if self._is_label_match(text, 'agama'):
            val = self._extract_after_label(text_normalized, r'a?g{0,2}a?m{0,2}a')
            if val and result['agama'] is None:
                result['agama'] = val.upper()
            elif result['agama'] is None and i + 1 < len(texts):
                # Value on the next line; only accept official religions.
                next_text = texts[i+1].strip().upper()
                if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']:
                    result['agama'] = next_text
        else:
            # No label: the line may contain only the religion name.
            for agama in self.AGAMA_LIST:
                if agama in text_lower and len(text) < 20:
                    if result['agama'] is None:
                        result['agama'] = text.strip().upper()
                    break

        # ===== STATUS PERKAWINAN =====
        if 'kawin' in text_lower:
            if result['status_perkawinan'] is None:
                # Check against the official status list first
                # (normalizing the common OCR merge BELUMKAWIN).
                text_upper = text.upper().replace("BELUMKAWIN", "BELUM KAWIN")
                for status in self.STATUS_PERKAWINAN_LIST:
                    if status in text_upper:
                        result['status_perkawinan'] = status
                        break

                # Fallback to label extraction if not found in the list.
                if result['status_perkawinan'] is None:
                    val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
                    if val:
                        result['status_perkawinan'] = val.upper()

        # ===== PEKERJAAN =====
        if 'pekerjaan' in text_lower:
            val = self._extract_after_label(text_normalized, 'pekerjaan')
            if val and result['pekerjaan'] is None:
                result['pekerjaan'] = val.upper()
            elif result['pekerjaan'] is None and i + 1 < len(texts):
                # Value on the next line (real KTP layout).
                next_text = texts[i+1].strip()
                if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
                    result['pekerjaan'] = next_text.upper()
        else:
            # No label: the line may contain a known occupation keyword.
            for pekerjaan in self.PEKERJAAN_LIST:
                if pekerjaan in text_lower and len(text) < 30:
                    if result['pekerjaan'] is None:
                        result['pekerjaan'] = text.strip().upper()
                    break

        # ===== KEWARGANEGARAAN =====
        if 'wni' in text_lower:
            result['kewarganegaraan'] = 'WNI'
        elif 'wna' in text_lower:
            result['kewarganegaraan'] = 'WNA'
        elif 'warga' in text_lower and result['kewarganegaraan'] is None:
            val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
            if val:
                result['kewarganegaraan'] = val.upper()

        # ===== BERLAKU HINGGA =====
        # NOTE(review): berlaku_hingga is initialized to 'SEUMUR HIDUP'
        # above, so this None-guarded branch can never fire as written.
        if 'berlaku' in text_lower or 'seumur' in text_lower:
            if result['berlaku_hingga'] is None:
                if 'seumur' in text_lower or 'hidup' in text_lower:
                    result['berlaku_hingga'] = 'SEUMUR HIDUP'
                else:
                    val = self._extract_after_label(text_normalized, 'berlaku')
                    if val:
                        result['berlaku_hingga'] = val.upper()

        # ===== TANGGAL PENERBITAN (issue date, usually DD-MM-YYYY near
        # the bottom of the card) =====
        if result['tanggal_penerbitan'] is None:
            # 1. Skip lines that belong to other date fields
            #    ('lahlr' covers a common OCR misread of 'lahir').
            line_clean = text.lower()
            if any(kw in line_clean for kw in ['lahir', 'lahlr', 'tgl', 'tempat', 'berlaku', 'seumur', 'hingga', 'hidup']):
                pass  # skip: this date belongs to another field
            else:
                # Match a date at the end of the line / standalone date.
                date_match = re.search(r'(\d{2}[-\s/]\d{2}[-\s/]\d{4})$', text.strip())
                if date_match:
                    found_date = date_match.group(1).replace(' ', '-')
                    # Must differ from the birth date...
                    if result['tanggal_lahir'] != found_date:
                        # ...and sit in the bottom 30% of the lines, to
                        # avoid picking up an unparsed birth date.
                        if i > len(texts) * 0.7:
                            result['tanggal_penerbitan'] = found_date
                            print(f" [TGL TERBIT] Found '{found_date}' at index {i}/{len(texts)}")
                        else:
                            print(f" [TGL TERBIT SKIP] Date '{found_date}' is too high ({i}/{len(texts)})")

    # ============================================
    # AGGRESSIVE SCAN: religion across all OCR text.
    # Indonesia recognizes only 6 official religions, so direct keyword
    # matching (with common OCR misspellings) is reliable.
    # ============================================
    if result['agama'] is None:
        # Canonical name -> known OCR spelling variants.
        agama_patterns = {
            'ISLAM': ['ISLAM', 'ISLM', 'ISIAM', 'ISLAMI'],
            'KRISTEN': ['KRISTEN', 'KRISTEN PROTESTAN', 'PROTESTAN', 'KRISTN'],
            'KATOLIK': ['KATOLIK', 'KATHOLIK', 'KATHOLK', 'KATOLIK ROMA', 'KATOLIK.'],
            'HINDU': ['HINDU', 'HNDU', 'HINDU DHARMA', 'HINDHU'],
            'BUDDHA': ['BUDDHA', 'BUDHA', 'BUDDA', 'BUDDHIS'],
            'KONGHUCU': ['KONGHUCU', 'KHONGHUCU', 'KONGHUCHU', 'CONFUCIUS'],
        }

        for text in texts:
            text_upper = text.upper().strip()
            # Skip lines too short or too long to be a religion value.
            if len(text_upper) < 4 or len(text_upper) > 30:
                continue

            for agama_std, variants in agama_patterns.items():
                for variant in variants:
                    if variant in text_upper:
                        result['agama'] = agama_std
                        print(f" [AGAMA SCAN] Found '{variant}' in '{text_upper}' -> {agama_std}")
                        break
                if result['agama']:
                    break
            if result['agama']:
                break

    # ============================================
    # AGGRESSIVE SCAN: blood type across all OCR text.
    # Only 4 types exist (A, B, AB, O), optionally with rhesus +/-.
    # ============================================
    if result['gol_darah'] is None:
        # Ordered longest-first so AB matches before A/B.
        gol_darah_patterns = ['AB+', 'AB-', 'A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB', 'A', 'B', 'O']

        for text in texts:
            text_upper = text.upper().strip()
            # Strip common punctuation/whitespace.
            text_clean = re.sub(r'[:\.\,\s]+', '', text_upper)
            # OCR frequently reads letter O as digit 0 — map it back.
            text_clean = text_clean.replace('0', 'O')

            # Too long to be a blood-type fragment.
            if len(text_clean) > 10:
                continue

            for gol in gol_darah_patterns:
                # Exact match after cleaning.
                if text_clean == gol:
                    result['gol_darah'] = gol
                    print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
                    break
                # Match with a glued GOL/GOLDARAH label prefix.
                if text_clean == f"GOL{gol}" or text_clean == f"GOLDARAH{gol}":
                    result['gol_darah'] = gol
                    print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
                    break
                # Match as the trailing character(s) of a very short line.
                if len(text_clean) <= 3 and text_clean.endswith(gol):
                    result['gol_darah'] = gol
                    print(f" [GOL DARAH SCAN] Found '{text_upper}' -> {gol}")
                    break

            if result['gol_darah']:
                break

    # ============================================
    # AGGRESSIVE SCAN: validity ("berlaku hingga") across all OCR text.
    # NOTE(review): dead as written — berlaku_hingga defaults to
    # 'SEUMUR HIDUP' and is never reset to None before this point.
    # ============================================
    if result['berlaku_hingga'] is None:
        for text in texts:
            text_upper = text.upper().strip()
            if 'SEUMUR' in text_upper or 'HIDUP' in text_upper:
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
                print(f" [BERLAKU SCAN] Found '{text_upper}' -> SEUMUR HIDUP")
                break

    # Post-processing: validation, cleanup and cross-field fixes.
    result = self._post_process(result)

    return result
|
||
|
||
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
    """Return the value that follows a field label, or None if absent.

    Two separator styles are tried in order: "label: value" (any colon
    spacing) and "label VALUE" where the value starts with an uppercase
    letter or digit. Stray leading/trailing colons are stripped and
    single-character remnants are rejected.
    """
    candidate_patterns = (
        rf'(?:{label_pattern})\s*:\s*(.+)',      # label : value
        rf'(?:{label_pattern})\s+([A-Z0-9].+)',  # label VALUE
    )

    for candidate in candidate_patterns:
        hit = re.search(candidate, text, re.IGNORECASE)
        if hit is None:
            continue
        raw = hit.group(1).strip()
        # Drop separator residue on either side of the captured value.
        raw = re.sub(r'^[:\s]+', '', raw)
        raw = re.sub(r'\s*:\s*$', '', raw)
        if len(raw) > 1:
            return raw

    return None
|
||
|
||
def _parse_ttl(self, ttl_text: str, result: Dict):
    """Parse a place/date-of-birth string into result['tempat_lahir']
    and result['tanggal_lahir'].

    Repairs common OCR damage first (missing dashes, glued 8-digit
    dates, city names fused to the date), then splits place from date.
    """
    work = ttl_text.strip()

    # Separator repairs, applied in this exact order:
    #   "05 08 1978" / "05 08-1978" / "05-08 1978" -> "05-08-1978"
    #   "0508-1978" -> "05-08-1978" (missing day/month separator)
    separator_fixes = (
        (r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3'),
        (r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3'),
        (r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3'),
        (r'(\d{2})(\d{2})[-/](\d{4})', r'\1-\2-\3'),
    )
    for pattern, replacement in separator_fixes:
        work = re.sub(pattern, replacement, work)

    # Fully glued 8-digit date: "05081978" -> "05-08-1978".
    glued = re.search(r'(\d{8})', work)
    if glued:
        raw_digits = glued.group(1)
        work = work.replace(
            raw_digits,
            f"{raw_digits[:2]}-{raw_digits[2:4]}-{raw_digits[4:]}"
        )

    # City fused to the date: "JAKARTA05-08-1978" -> "JAKARTA 05-08-1978";
    # "JAKARTA.05-08-1978" -> dot becomes a space.
    work = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', work, flags=re.IGNORECASE)
    work = re.sub(r'([A-Z])\.(\d)', r'\1 \2', work, flags=re.IGNORECASE)

    # Expected shape: "PLACE, DD-MM-YYYY" or "PLACE DD-MM-YYYY".
    date_hit = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', work)
    if date_hit:
        result['tanggal_lahir'] = date_hit.group(1)
        # Everything before the date is the birthplace.
        place = work[:date_hit.start()].strip(' ,:-/.')
        # Remove any label remnants still glued to the front.
        place = re.sub(r'^(tempat|tgl|lahir|:|:)[/\s::]*', '', place, flags=re.IGNORECASE).strip()
        if place and len(place) > 2:
            result['tempat_lahir'] = place.upper()
    else:
        # No recognizable date: fall back to comma splitting.
        pieces = work.split(',')
        if len(pieces) >= 2:
            result['tempat_lahir'] = pieces[0].strip().upper()
            result['tanggal_lahir'] = pieces[1].strip()
        elif len(pieces) == 1 and len(work) > 2:
            result['tempat_lahir'] = work.upper()
|
||
|
||
def _post_process(self, result: Dict) -> Dict:
    """
    Validate and normalize the extracted fields in place.

    Steps, in order: NIK validation, birth-date format repair, generic
    colon/whitespace cleanup, label-remnant stripping, validity default,
    Balinese-name handling, religion fuzzy validation, merged-word
    spacing fixes, and a place-of-birth vs. village cross-check.
    Returns the same (mutated) dict.
    """
    # NIK must be exactly 16 digits; try stripping non-digits, else drop.
    if result['nik'] and not re.match(r'^\d{16}$', result['nik']):
        cleaned = re.sub(r'\D', '', result['nik'])
        if len(cleaned) == 16:
            result['nik'] = cleaned
        else:
            result['nik'] = None

    # Repair malformed birth-date formats.
    # DDMM-YYYY (e.g. 1608-1976) -> DD-MM-YYYY (16-08-1976)
    if result['tanggal_lahir']:
        tl = result['tanggal_lahir']
        wrong_format = re.match(r'^(\d{2})(\d{2})-(\d{4})$', tl)
        if wrong_format:
            result['tanggal_lahir'] = f"{wrong_format.group(1)}-{wrong_format.group(2)}-{wrong_format.group(3)}"
            print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")
        # DDMMYYYY (no separator at all)
        no_sep_format = re.match(r'^(\d{2})(\d{2})(\d{4})$', tl)
        if no_sep_format:
            result['tanggal_lahir'] = f"{no_sep_format.group(1)}-{no_sep_format.group(2)}-{no_sep_format.group(3)}"
            print(f" [DATE FIX] '{tl}' -> '{result['tanggal_lahir']}'")

    # Generic cleanup on every string value: strip leading/trailing
    # colons (ASCII and full-width) and collapse repeated whitespace.
    for field in result:
        if result[field] and isinstance(result[field], str):
            val = result[field]
            val = re.sub(r'^[\s::]+', '', val)
            val = re.sub(r'[\s::]+$', '', val)
            val = re.sub(r'\s+', ' ', val)
            result[field] = val.strip()

    # Strip label words that leaked into the captured values.
    for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
        if result[field]:
            result[field] = re.sub(
                r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s::]*',
                '', result[field], flags=re.IGNORECASE
            ).strip()

    # Marital status may still carry its own label prefix.
    if result['status_perkawinan']:
        sp = result['status_perkawinan']
        sp = re.sub(r'^(STATUS|PERKAWINAN)[\s::]*', '', sp, flags=re.IGNORECASE).strip()
        result['status_perkawinan'] = sp

    # Normalize validity; any mention of SEUMUR/HIDUP means life-long.
    if result['berlaku_hingga']:
        bh = result['berlaku_hingga']
        bh = re.sub(r'^(BERLAKU|HINGGA)[\s::]*', '', bh, flags=re.IGNORECASE).strip()
        if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
            result['berlaku_hingga'] = 'SEUMUR HIDUP'
        else:
            result['berlaku_hingga'] = bh
    else:
        # Fallback: e-KTP cards issued since 2011 are valid for life
        # by government regulation.
        result['berlaku_hingga'] = 'SEUMUR HIDUP'
        print(" [FALLBACK] berlaku_hingga = SEUMUR HIDUP (peraturan pemerintah)")

    # ============================================
    # Balinese-name handling.
    # A card is treated as Balinese when:
    #   1. the province is BALI,
    #   2. the NIK starts with 51 (Bali province code), or
    #   3. the name begins with a characteristic Balinese component.
    # ============================================
    is_bali = False
    if result.get('provinsi') and 'BALI' in result['provinsi'].upper():
        is_bali = True
    elif result.get('nik') and result['nik'].startswith('51'):
        is_bali = True
    elif result.get('nama'):
        nama_upper = result['nama'].upper()
        # Check for typical Balinese name prefixes (incl. OCR-merged forms).
        if nama_upper.startswith('NI') or nama_upper.startswith('IGUSTI') or \
           nama_upper.startswith('IDABAGUS') or nama_upper.startswith('IDAAYU') or \
           any(nama_upper.startswith(p) for p in ['GUSTI', 'WAYAN', 'MADE', 'NYOMAN', 'KETUT', 'PUTU', 'KADEK', 'KOMANG']):
            is_bali = True

    if is_bali and result.get('nama'):
        result['nama'] = self._parse_balinese_name(result['nama'])

    # ============================================
    # Religion validation: fuzzy-match against the official list.
    # ============================================
    if result.get('agama'):
        agama = result['agama'].upper().strip()
        agama_match = None
        best_ratio = 0
        for valid_agama in self.AGAMA_LIST:
            ratio = difflib.SequenceMatcher(None, agama, valid_agama.upper()).ratio()
            if ratio > best_ratio and ratio > 0.6:
                best_ratio = ratio
                agama_match = valid_agama.upper()

        if agama_match:
            if agama_match != agama:
                print(f" [AGAMA VALIDATE] '{agama}' -> '{agama_match}' (ratio={best_ratio:.2f})")
            result['agama'] = agama_match
        # Deliberately no automatic fallback: religion must come from OCR.

    # Fix merged regency/city names (e.g. JAKARTASELATAN -> JAKARTA SELATAN).
    if result['kabupaten_kota']:
        kk = result['kabupaten_kota']
        # Insert a space before directional suffixes.
        kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                    r'\1 \2', kk, flags=re.IGNORECASE)
        # Common merged label prefix: KOTAX -> KOTA X.
        kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
        result['kabupaten_kota'] = kk.upper()

    # Fix merged province names (DKIJAKARTA, JAWABARAT, ...).
    if result['provinsi']:
        prov = result['provinsi']
        prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
        prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                      r'\1 \2', prov, flags=re.IGNORECASE)
        result['provinsi'] = prov.upper()

    # Fix merged address fragments (e.g. JLKECAPIV -> JL KECAPI V).
    if result['alamat']:
        alamat = result['alamat']
        # Space after common street prefixes.
        alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)
        # Space before trailing Roman numerals (I..X).
        alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
        # Space before trailing digits.
        alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
        # "NO123" -> "NO 123", "BLOKA" -> "BLOK A".
        alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
        result['alamat'] = alamat.upper()

    # ============================================
    # Cross-validation: birthplace vs. village/kelurahan.
    # On KTP cards the birthplace often equals the village name; when
    # they are similar, prefer the already-validated village value.
    # ============================================
    if result.get('tempat_lahir') and result.get('kel_desa'):
        tl = result['tempat_lahir'].upper()
        kd = result['kel_desa'].upper()

        # Similarity score between the two strings.
        ratio = difflib.SequenceMatcher(None, tl, kd).ratio()

        if ratio > 0.7:
            # Close enough: adopt the validated village spelling.
            print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mirip dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")
            result['tempat_lahir'] = kd
        elif ratio > 0.5:
            # Borderline similarity: log for debugging only.
            print(f" [CROSS-VALIDATE] Tempat Lahir '{tl}' mungkin sama dengan Kel/Desa '{kd}' (ratio={ratio:.2f})")

    # If tempat_lahir is empty while kel_desa exists, they might match,
    # but we do not auto-fill since they can legitimately differ.

    return result
|
||
|
||
|
||
if __name__ == "__main__":
    # Smoke test: run the extractor against a synthetic OCR transcript
    # of a KTP and dump every extracted field.
    sample_lines = (
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    )
    sample_ocr = [{'text': line} for line in sample_lines]

    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)

    for key, value in result.items():
        print(f"{key}: {value}")
|