# Source: local-ocr/ktp_extractor.py (captured 2025-12-28 01:20:37 +08:00; 603 lines, 28 KiB, Python).
#
# Viewer notice: this file contains Unicode characters that might be confused
# with other characters (for example, a full-width colon that looks like an
# ASCII colon). If that is intentional, the warning can be ignored.
"""
KTP Field Extractor
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
"""
import logging
import re
from typing import Dict, Optional, List
class KTPExtractor:
    """Extract structured fields from OCR output of an Indonesian ID card (KTP).

    Extraction runs in two passes:

    1. Zone pass: every OCR text that carries ``x_center``/``y_center`` is
       assigned to a layout zone from ``ZONES`` and a few fields are read
       directly from their expected zone.
    2. Line pass: every text line is scanned for field labels/keywords and
       fills whatever the zone pass left empty.

    All heuristics are best-effort; a field that cannot be found stays ``None``.
    """

    _log = logging.getLogger(__name__)

    # Colon look-alikes emitted by OCR engines, written as escapes so the
    # source contains no ambiguous characters: ASCII colon, FULLWIDTH COLON
    # (U+FF1A), SMALL COLON (U+FE55) and RATIO (U+2236).
    COLON_PATTERN = r'[:\uFF1A\uFE55\u2236]'

    # Gender keywords
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Valid religions
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupations
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max),
    # based on the standard KTP layout. Zones overlap; the first match in
    # declaration order wins.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Name area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Place/date of birth
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Gender (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Blood type (right of gender)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Address
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Kel/Desa
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # Kecamatan
        'agama': (0.02, 0.63, 0.70, 0.72),             # Religion
        'status': (0.02, 0.69, 0.70, 0.78),            # Marital status
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Occupation
        'wni': (0.02, 0.81, 0.70, 0.90),               # Citizenship
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Valid until
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Place & date of issue
    }

    # Street-type prefixes, longest first so "JLN" is preferred over "JL".
    _STREET_PREFIXES = ('JALAN', 'JLN', 'JL', 'GANG', 'GG', 'BLOK', 'NO')

    # Blood type after its label. Only the four real types are accepted and
    # the letter must not run into another letter, so label text such as the
    # "o" in "Gol" can never be mistaken for the value.
    _BLOOD_LABELLED = r'(?:darah|gol\.)[\s.:]*(AB|A|B|O)([+\-]?)(?![A-Za-z])'
    _BLOOD_STANDALONE = r'(AB|A|B|O)([+\-]?)'

    def __init__(self):
        self.image_width = 0
        self.image_height = 0

    def _normalize_colons(self, text: str) -> str:
        """Replace every colon look-alike in ``text`` with an ASCII colon."""
        return re.sub(self.COLON_PATTERN, ':', text)

    def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
        """Return the first zone containing the point, or None.

        Coordinates are pixel values; they are normalized by the image size
        before being compared against ``ZONES``. A zero image dimension
        yields None.
        """
        if img_width == 0 or img_height == 0:
            return None
        x_norm = x_center / img_width
        y_norm = y_center / img_height
        for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
            if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
                return zone_name
        return None

    def _extract_value_from_text(self, text: str) -> str:
        """Return the part after the first colon (any variant), stripped.

        When ``text`` has no colon the whole stripped text is returned.
        """
        parts = re.split(self.COLON_PATTERN, text, maxsplit=1)
        if len(parts) > 1:
            return parts[1].strip()
        return text.strip()

    def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
        """Estimate (width, height) from the largest bounding-box coordinates.

        A 5% margin is added. Falls back to (1000, 640) when no usable
        bounding box is present.
        """
        max_x, max_y = 0, 0
        for r in ocr_results:
            bbox = r.get('bbox')
            if bbox is None or len(bbox) < 4:
                continue
            for point in bbox:
                # Skip anything that is not an (x, y, ...) sequence.
                if hasattr(point, '__len__') and len(point) >= 2:
                    max_x = max(max_x, point[0])
                    max_y = max(max_y, point[1])
        if max_x > 0:
            return (int(max_x * 1.05), int(max_y * 1.05))
        return (1000, 640)

    def _find_blood_type(self, text: str) -> Optional[str]:
        """Return the blood type found in ``text`` (e.g. 'A', 'AB+'), or None.

        Matches either a value following a "darah"/"gol." label, or a text
        that consists of nothing but a blood type.
        """
        text = self._normalize_colons(text)
        match = re.search(self._BLOOD_LABELLED, text, re.IGNORECASE)
        if match is None:
            match = re.fullmatch(self._BLOOD_STANDALONE, text.strip(' \t:'), re.IGNORECASE)
        if match is None:
            return None
        return (match.group(1) + match.group(2)).upper()

    @staticmethod
    def _match_marital_status(text_lower: str) -> Optional[str]:
        """Map lower-cased free text to a canonical marital status, or None."""
        if 'belum' in text_lower:
            return 'BELUM KAWIN'
        if 'cerai' in text_lower:
            if 'hidup' in text_lower:
                return 'CERAI HIDUP'
            if 'mati' in text_lower:
                return 'CERAI MATI'
            return None
        if 'kawin' in text_lower:
            return 'KAWIN'
        return None

    def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
        """Fill ``result`` in place from texts grouped by layout zone.

        Args:
            zone_texts: Mapping of zone name to the OCR texts assigned to it.
            result: Field dict to update; only fields found here are written.
        """
        # PROVINSI from header
        for text in zone_texts.get('header_provinsi', []):
            if 'provinsi' in text.lower():
                val = re.sub(r'(?i)provinsi\s*', '', text).strip()
                if val:
                    result['provinsi'] = val.upper()
                    break

        # KABUPATEN/KOTA from header
        for text in zone_texts.get('header_kabupaten', []):
            text_lower = text.lower()
            if 'kabupaten' in text_lower or 'kota' in text_lower:
                # Remove only the first label word: names such as KOTABARU
                # contain "kota" themselves.
                val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text, count=1).strip()
                result['kabupaten_kota'] = (val or text).upper()
                break

        # NAMA: take the labelled value, or the first non-label text.
        for text in zone_texts.get('nama', []):
            if re.search(r'\bnama\b', text, re.IGNORECASE):
                val = self._extract_value_from_text(text)
                if val and not re.search(r'\bnama\b', val, re.IGNORECASE):
                    result['nama'] = val.upper()
                    break  # do not let later texts in the zone overwrite it
            elif len(text) > 2:
                result['nama'] = text.upper()
                break

        # TTL (place/date of birth)
        for text in zone_texts.get('ttl', []):
            text_lower = text.lower()
            if 'tempat' in text_lower or 'lahir' in text_lower:
                val = self._extract_value_from_text(text)
                if val:
                    self._parse_ttl(val, result)
                    break

        # JENIS KELAMIN
        for text in zone_texts.get('jenis_kelamin', []):
            text_lower = text.lower()
            if 'laki' in text_lower:
                result['jenis_kelamin'] = 'LAKI-LAKI'
                break
            if 'perempuan' in text_lower:
                result['jenis_kelamin'] = 'PEREMPUAN'
                break

        # GOL DARAH
        for text in zone_texts.get('gol_darah', []):
            blood = self._find_blood_type(text)
            if blood:
                result['gol_darah'] = blood
                break

        # ALAMAT: value after the label, or the first non-label text.
        for text in zone_texts.get('alamat', []):
            val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
            if val and 'alamat' not in val.lower():
                result['alamat'] = val.upper()
                break

        # PENERBITAN area (place & date share one zone): first date wins.
        for text in zone_texts.get('penerbitan', []):
            date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
            if date_match and result.get('tanggal_penerbitan') is None:
                result['tanggal_penerbitan'] = date_match.group(1)

    def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
        """Extract KTP fields from OCR results.

        Args:
            ocr_results: List of dicts as produced by the OCR step. Each dict
                is read for ``text`` and, optionally, ``x_center``,
                ``y_center`` and ``bbox`` (a sequence of (x, y) points).

        Returns:
            Dict with one entry per KTP field; fields not found are None.
        """
        result = {
            'nik': None,
            'nama': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'jenis_kelamin': None,
            'gol_darah': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'agama': None,
            'status_perkawinan': None,
            'pekerjaan': None,
            'kewarganegaraan': None,
            'berlaku_hingga': None,
            'provinsi': None,
            'kabupaten_kota': None,
            'tanggal_penerbitan': None,
        }

        img_width, img_height = self._detect_image_size(ocr_results)
        self.image_width, self.image_height = img_width, img_height

        # Normalize every text once and group texts by layout zone.
        texts: List[str] = []
        zone_texts: Dict[str, List[str]] = {}
        for r in ocr_results:
            text = self._normalize_colons(str(r.get('text') or '')).strip()
            if not text:
                continue
            texts.append(text)
            zone = self._get_zone(r.get('x_center', 0), r.get('y_center', 0), img_width, img_height)
            if zone:
                zone_texts.setdefault(zone, []).append(text)

        # Log counts only: the texts themselves are personal data.
        self._log.debug("Zone assignments: %s",
                        {zone: len(items) for zone, items in zone_texts.items()})

        self._extract_by_zones(zone_texts, result)

        # NIK (16 digits) may appear anywhere in the text.
        nik_match = re.search(r'\b(\d{16})\b', '\n'.join(texts))
        if nik_match:
            result['nik'] = nik_match.group(1)
            self._log.debug("NIK found")

        # Line-by-line fallback for fields the zone pass did not find.
        self._extract_by_lines(texts, result)

        return self._post_process(result)

    def _extract_by_lines(self, texts: List[str], result: Dict):
        """Scan colon-normalized text lines for labels and fill ``result`` in place.

        Most fields are only written when still None. The sections run in a
        fixed order for every line because later ones (issue date) depend on
        values set by earlier ones (birth date, validity).
        """
        total = len(texts)
        for i, text in enumerate(texts):
            text_lower = text.lower()
            next_text = texts[i + 1] if i + 1 < total else None

            # ===== PROVINSI =====
            if result['provinsi'] is None and 'provinsi' in text_lower:
                val = self._extract_after_label(text, 'provinsi')
                if val:
                    result['provinsi'] = val.upper()
                elif next_text is not None and 'provinsi' not in next_text.lower():
                    # The value may sit on the next line.
                    result['provinsi'] = next_text.upper()

            # ===== KABUPATEN/KOTA =====
            if (result['kabupaten_kota'] is None
                    and 'provinsi' not in text_lower
                    and any(kw in text_lower for kw in ('kabupaten', 'kota', 'jakarta'))):
                val = self._extract_after_label(text, 'kabupaten|kota')
                result['kabupaten_kota'] = (val or text).upper()

            # ===== NAMA =====
            # Whole-word match so names such as PURNAMA are not taken for the label.
            if result['nama'] is None and re.search(r'\bnama\b', text_lower):
                val = self._extract_after_label(text, r'\bnama\b')
                if val and len(val) > 2:
                    result['nama'] = val.upper()
                elif (next_text is not None and len(next_text) > 2
                      and not any(kw in next_text.lower() for kw in ('tempat', 'lahir', 'jenis'))):
                    result['nama'] = next_text.upper()

            # ===== TEMPAT/TANGGAL LAHIR =====
            if ('tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower)) \
                    and (result['tempat_lahir'] is None or result['tanggal_lahir'] is None):
                ttl = self._extract_after_label(
                    text, r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
                if not ttl and ':' in text:
                    # Value follows a colon but the label pattern did not match.
                    ttl = text.split(':', 1)[1].strip()
                if ttl:
                    self._parse_ttl(ttl, result)
                elif next_text is not None and not any(
                        kw in next_text.lower() for kw in ('jenis', 'kelamin', 'alamat', 'gol')):
                    self._parse_ttl(next_text, result)

            # ===== JENIS KELAMIN =====
            # Female first: "female" contains the male keyword "male".
            if result['jenis_kelamin'] is None:
                if any(kw in text_lower for kw in self.FEMALE_KEYWORDS):
                    result['jenis_kelamin'] = 'PEREMPUAN'
                elif any(kw in text_lower for kw in self.MALE_KEYWORDS):
                    result['jenis_kelamin'] = 'LAKI-LAKI'

            # ===== GOLONGAN DARAH =====
            if result['gol_darah'] is None:
                blood = self._find_blood_type(text)
                if (blood is None and next_text is not None
                        and ('darah' in text_lower or 'gol.' in text_lower)):
                    # Label on this line, value on the next one.
                    blood = self._find_blood_type(next_text)
                if blood:
                    result['gol_darah'] = blood

            # ===== ALAMAT =====
            if result['alamat'] is None and 'alamat' in text_lower:
                val = self._extract_after_label(text, 'alamat')
                if val:
                    result['alamat'] = val.upper()
                elif next_text is not None:
                    result['alamat'] = next_text.upper()

            # ===== RT/RW =====
            rt_rw_match = re.search(r'(?<!\d)(\d{3})\s*/\s*(\d{3})(?!\d)', text)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # ===== KELURAHAN/DESA =====
            if (result['kel_desa'] is None and 'kelamin' not in text_lower
                    and ('kel' in text_lower or 'desa' in text_lower)):
                val = self._extract_after_label(text, 'kel|desa')
                if val:
                    result['kel_desa'] = val.upper()
                elif next_text is not None:
                    result['kel_desa'] = next_text.upper()

            # ===== KECAMATAN =====
            if result['kecamatan'] is None and (
                    'kecamatan' in text_lower
                    or ('kec' in text_lower and 'kelamin' not in text_lower)):
                val = self._extract_after_label(text, 'kecamatan|kec')
                if val:
                    result['kecamatan'] = val.upper()
                elif (next_text is not None and len(next_text) > 2
                      and not any(kw in next_text.lower() for kw in ('agama', 'status', 'pekerjaan'))):
                    result['kecamatan'] = next_text.upper()

            # ===== AGAMA =====
            if result['agama'] is None:
                if 'agama' in text_lower:
                    val = self._extract_after_label(text, 'agama')
                    if val:
                        result['agama'] = val.upper()
                    elif next_text is not None and next_text.lower() in self.AGAMA_LIST:
                        result['agama'] = next_text.upper()
                elif len(text) < 20 and any(agama in text_lower for agama in self.AGAMA_LIST):
                    # Short line holding only the religion name.
                    result['agama'] = text.upper()

            # ===== STATUS PERKAWINAN =====
            if result['status_perkawinan'] is None and (
                    'kawin' in text_lower or 'cerai' in text_lower):
                val = self._extract_after_label(text, 'status.*kawin|perkawinan')
                if val:
                    result['status_perkawinan'] = val.upper()
                else:
                    # Drop the label words first: "perkawinan" itself contains
                    # "kawin" and must not be read as the value.
                    bare = re.sub(r'status|perkawinan', ' ', text_lower)
                    result['status_perkawinan'] = self._match_marital_status(bare)

            # ===== PEKERJAAN =====
            if result['pekerjaan'] is None:
                if 'pekerjaan' in text_lower:
                    val = self._extract_after_label(text, 'pekerjaan')
                    if val:
                        result['pekerjaan'] = val.upper()
                    elif (next_text is not None and len(next_text) > 2
                          and 'kewarganegaraan' not in next_text.lower()):
                        result['pekerjaan'] = next_text.upper()
                elif len(text) < 30 and any(job in text_lower for job in self.PEKERJAAN_LIST):
                    result['pekerjaan'] = text.upper()

            # ===== KEWARGANEGARAAN =====
            if 'wni' in text_lower:
                result['kewarganegaraan'] = 'WNI'
            elif 'wna' in text_lower:
                result['kewarganegaraan'] = 'WNA'
            elif 'warga' in text_lower and result['kewarganegaraan'] is None:
                val = self._extract_after_label(text, 'kewarganegaraan|warga')
                if val:
                    result['kewarganegaraan'] = val.upper()

            # ===== BERLAKU HINGGA =====
            if result['berlaku_hingga'] is None and (
                    'berlaku' in text_lower or 'seumur' in text_lower):
                if 'seumur' in text_lower or 'hidup' in text_lower:
                    result['berlaku_hingga'] = 'SEUMUR HIDUP'
                else:
                    val = self._extract_after_label(text, 'berlaku')
                    if val:
                        result['berlaku_hingga'] = val.upper()

            # ===== TANGGAL PENERBITAN (usually DD-MM-YYYY near the end) =====
            if result['tanggal_penerbitan'] is None:
                date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text)
                if date_match:
                    found_date = date_match.group(1)
                    # Must differ from the birth date, and is only trusted
                    # after the validity field or in the last ~30% of lines.
                    if result['tanggal_lahir'] != found_date and (
                            result['berlaku_hingga'] or i > total * 0.7):
                        result['tanggal_penerbitan'] = found_date

    def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
        """Return the value following a label, or None.

        Args:
            text: One OCR line.
            label_pattern: Regex alternation describing the label.

        Two shapes are tried in order: ``label : value`` and ``label value``.
        Values shorter than two characters are rejected.
        """
        text = self._normalize_colons(text)
        patterns = (
            rf'(?:{label_pattern})\s*:\s*(.+)',       # label: value
            rf'(?:{label_pattern})\s+([A-Z0-9].+)',   # label VALUE
        )
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            value = match.group(1).strip()
            # Remove stray leading/trailing colons.
            value = re.sub(r'^[:\s]+', '', value)
            value = re.sub(r'\s*:\s*$', '', value)
            if len(value) > 1:
                return value
        return None

    def _parse_ttl(self, ttl_text: str, result: Dict):
        """Split "PLACE, DD-MM-YYYY" text into ``tempat_lahir``/``tanggal_lahir``.

        Writes into ``result`` in place. Repairs common OCR damage first:
        missing date separators, 8-digit dates, a city glued to the date and
        leftover label words.
        """
        ttl_text = self._normalize_colons(ttl_text).strip()

        # Restore separators dropped by OCR: "05 08 1978", "05 08-1978", "05-08 1978".
        ttl_text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})\s+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[-/](\d{2})\s+(\d{4})', r'\1-\2-\3', ttl_text)

        # Exactly eight digits in a row: "05081978" -> "05-08-1978".
        date_8digit = re.search(r'(?<!\d)(\d{8})(?!\d)', ttl_text)
        if date_8digit:
            d = date_8digit.group(1)
            ttl_text = (ttl_text[:date_8digit.start()]
                        + f"{d[:2]}-{d[2:4]}-{d[4:]}"
                        + ttl_text[date_8digit.end():])

        # City glued to the date, e.g. "JAKARTA05-08-1978".
        ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)

        # Drop every leading label word ("Tempat/Tgl Lahir"), not just the first.
        ttl_text = re.sub(r'^[\s:]*(?:(?:tempat|tgl|lahir)[/\s:.]*)+', '',
                          ttl_text, flags=re.IGNORECASE).strip()
        if not ttl_text:
            return

        date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
        if date_match:
            result['tanggal_lahir'] = date_match.group(1)
            # The place is whatever precedes the date.
            place = ttl_text[:date_match.start()].strip(' ,:-/')
            if len(place) > 2:
                result['tempat_lahir'] = place.upper()
            return

        # No recognizable date: fall back to splitting on the comma.
        parts = ttl_text.split(',')
        if len(parts) >= 2:
            result['tempat_lahir'] = parts[0].strip().upper()
            result['tanggal_lahir'] = parts[1].strip()
        elif len(ttl_text) > 2:
            result['tempat_lahir'] = ttl_text.upper()

    def _fix_merged_alamat(self, alamat: str) -> str:
        """Re-insert spaces that OCR dropped from an address (heuristic)."""
        tokens = alamat.split()
        if not tokens:
            return alamat.upper()
        # A single token means the address was merged into one word.
        merged = len(tokens) == 1

        # Split a glued street prefix ("JLMERDEKA"), unless the first word
        # already is a prefix on its own ("JLN MERDEKA").
        if tokens[0].rstrip('.').upper() not in self._STREET_PREFIXES:
            prefixes = '|'.join(self._STREET_PREFIXES)
            alamat = re.sub(rf'^({prefixes})([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)

        # Trailing Roman numeral. Ambiguous by nature (many street names end
        # in I/V/X), so only attempted on fully merged input.
        if merged:
            alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)

        # Trailing number glued to a word.
        alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
        # "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
        alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
        return alamat.upper()

    def _post_process(self, result: Dict) -> Dict:
        """Clean up extracted values in place and return ``result``.

        Validates the NIK, trims colons/whitespace, strips label words that
        were captured with a value, and repairs some merged place names.
        Values that end up empty become None.
        """
        # NIK must be exactly 16 digits.
        nik = result.get('nik')
        if nik and not re.fullmatch(r'\d{16}', nik):
            digits = re.sub(r'\D', '', nik)
            result['nik'] = digits if len(digits) == 16 else None

        # Trim colons (any variant) and collapse whitespace in every value.
        for field in list(result):
            val = result[field]
            if isinstance(val, str):
                val = self._normalize_colons(val)
                val = re.sub(r'^[\s:]+|[\s:]+$', '', val)
                val = re.sub(r'\s+', ' ', val)
                result[field] = val or None

        # Strip leading label words. The word boundary keeps real values that
        # merely start with a label's letters (KELAPA, DESAK) intact.
        label_prefix = (r'^(?:(?:KELURAHAN|KECAMATAN|PEKERJAAN|ALAMAT|TEMPAT|STATUS|'
                        r'LAHIR|AGAMA|NAMA|DESA|TGL|KEL|KEC)\b[\s:/.]*)+')
        for field in ('nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan'):
            if result.get(field):
                cleaned = re.sub(label_prefix, '', result[field], flags=re.IGNORECASE).strip()
                result[field] = cleaned or None

        # Marital status that still carries its label.
        if result.get('status_perkawinan'):
            sp = re.sub(r'^(?:(?:STATUS|PERKAWINAN)[\s:]*)+', '',
                        result['status_perkawinan'], flags=re.IGNORECASE).strip()
            result['status_perkawinan'] = sp or None

        # Validity: canonicalize lifetime validity.
        if result.get('berlaku_hingga'):
            bh = re.sub(r'^(?:(?:BERLAKU|HINGGA)[\s:]*)+', '',
                        result['berlaku_hingga'], flags=re.IGNORECASE).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh or None

        # Merged kabupaten/kota names (JAKARTASELATAN -> JAKARTA SELATAN).
        if result.get('kabupaten_kota'):
            kk = result['kabupaten_kota']
            kk = re.sub(
                r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)'
                r'(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                r'\1 \2', kk, flags=re.IGNORECASE)
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()

        # Merged province names.
        if result.get('provinsi'):
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(
                r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()

        # Merged address.
        if result.get('alamat'):
            result['alamat'] = self._fix_merged_alamat(result['alamat'])

        return result
if __name__ == "__main__":
    # Manual smoke test: run the extractor on a hand-written OCR sample and
    # print every extracted field.
    sample_ocr = [
        {'text': 'PROVINSI JAWA BARAT'},
        {'text': 'KABUPATEN BANDUNG'},
        {'text': 'NIK : 3204012345678901'},
        {'text': 'Nama : JOHN DOE'},
        {'text': 'Tempat/Tgl Lahir : BANDUNG, 01-01-1990'},
        {'text': 'Jenis Kelamin : LAKI-LAKI'},
        {'text': 'Alamat : JL. MERDEKA NO. 123'},
        {'text': 'RT/RW : 001/002'},
        {'text': 'Kel/Desa : SUKAMAJU'},
        {'text': 'Kecamatan : SUKASARI'},
        {'text': 'Agama : ISLAM'},
        {'text': 'Status Perkawinan : BELUM KAWIN'},
        {'text': 'Pekerjaan : KARYAWAN SWASTA'},
        {'text': 'Kewarganegaraan : WNI'},
        {'text': 'Berlaku Hingga : SEUMUR HIDUP'},
    ]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    for key, value in result.items():
        print(f"{key}: {value}")