OCR dengan ZONA

This commit is contained in:
2025-12-28 01:20:37 +08:00
commit 4fe381b3f0
12 changed files with 2356 additions and 0 deletions

602
ktp_extractor.py Normal file
View File

@@ -0,0 +1,602 @@
"""
KTP Field Extractor
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
"""
import re
from typing import Dict, Optional, List
class KTPExtractor:
    """Extract structured fields from the OCR output of an Indonesian KTP.

    ``extract`` assigns every OCR text to a layout zone (``ZONES``) using its
    centre point, reads what it can from those zones, and then runs a
    line-by-line keyword/regex pass over all texts, mostly to fill fields
    that are still unset. Labels may use a standard colon, a full-width
    colon (U+FF1A) or no colon at all.
    """

    # Full-width colon that some OCR engines emit instead of ':'.
    FULLWIDTH_COLON = '\uFF1A'
    # Character class matching either colon variant (standard, full-width).
    COLON_PATTERN = r'[:\uFF1A]'
    # A blood type token: A, B, AB or O with an optional rhesus sign.
    BLOOD_TYPE_PATTERN = r'(?:AB|A|B|O)[+\-]?'
    # Street prefixes, longest alternative first so "JLN" is not read as "JL" + "N".
    STREET_PREFIX_PATTERN = r'JALAN|JLN|JL|GANG|GG|BLOK|NO'

    # Keywords for gender detection
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Recognised religions
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupations
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max),
    # based on the standard KTP layout. Zones overlap on purpose; `_get_zone`
    # returns the first match in this insertion order.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Nama area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Tempat/Tgl Lahir
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Jenis Kelamin (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Gol Darah (right of jenis)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Alamat
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Kel/Desa
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # Kecamatan
        'agama': (0.02, 0.63, 0.70, 0.72),             # Agama
        'status': (0.02, 0.69, 0.70, 0.78),            # Status Perkawinan
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Pekerjaan
        'wni': (0.02, 0.81, 0.70, 0.90),               # Kewarganegaraan
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Berlaku Hingga
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Place & date of issue
    }

    def __init__(self):
        # Initialised to 0; no method of this class reads or updates them.
        self.image_width = 0
        self.image_height = 0

    def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
        """Return the name of the first zone containing the given point.

        The point is normalized by the image size before it is compared with
        ``ZONES``. Returns None when either dimension is 0 or no zone matches.
        """
        if img_width == 0 or img_height == 0:
            return None
        x_norm = x_center / img_width
        y_norm = y_center / img_height
        for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
            if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
                return zone_name
        return None

    def _extract_value_from_text(self, text: str) -> str:
        """Return the part after the first colon, or the stripped text if none.

        Both the standard and the full-width colon count as separators.
        """
        parts = re.split(self.COLON_PATTERN, text, maxsplit=1)
        if len(parts) > 1:
            return parts[1].strip()
        return text.strip()

    def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
        """Estimate (width, height) from the largest bounding-box coordinates.

        Only entries whose ``bbox`` has at least 4 points are considered. A 5%
        margin is added; (1000, 640) is returned when no positive x is found.
        """
        max_x, max_y = 0, 0
        for r in ocr_results:
            bbox = r.get('bbox', [])
            if bbox and len(bbox) >= 4:
                for point in bbox:
                    if len(point) >= 2:
                        max_x = max(max_x, point[0])
                        max_y = max(max_y, point[1])
        if max_x > 0:
            return (int(max_x * 1.05), int(max_y * 1.05))
        return (1000, 640)

    def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
        """Fill ``result`` in place from texts grouped by zone name.

        Handles provinsi, kabupaten_kota, nama, tempat/tanggal lahir,
        jenis_kelamin, gol_darah, alamat and tanggal_penerbitan; the remaining
        zones are not read here.
        """
        # PROVINSI from header
        for text in zone_texts.get('header_provinsi', []):
            if 'provinsi' in text.lower():
                val = re.sub(r'(?i)provinsi\s*', '', text).strip()
                if val:
                    result['provinsi'] = val.upper()
                    break

        # KABUPATEN/KOTA from header
        for text in zone_texts.get('header_kabupaten', []):
            text_lower = text.lower()
            if 'kabupaten' in text_lower or 'kota' in text_lower:
                val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text).strip()
                result['kabupaten_kota'] = val.upper() if val else text.upper()
                break

        # NAMA: either a "label: value" line or the first non-label line
        for text in zone_texts.get('nama', []):
            text_lower = text.lower()
            if 'nama' not in text_lower:
                if len(text) > 2:
                    result['nama'] = text.upper()
                    break
            else:
                val = self._extract_value_from_text(text)
                if val and 'nama' not in val.lower():
                    result['nama'] = val.upper()
                    # Stop here so a later line in the zone cannot overwrite it.
                    break

        # TTL (place/date of birth)
        for text in zone_texts.get('ttl', []):
            text_lower = text.lower()
            if 'tempat' in text_lower or 'lahir' in text_lower:
                val = self._extract_value_from_text(text)
                if val:
                    self._parse_ttl(val, result)
                    break

        # JENIS KELAMIN
        for text in zone_texts.get('jenis_kelamin', []):
            text_lower = text.lower()
            if 'laki' in text_lower:
                result['jenis_kelamin'] = 'LAKI-LAKI'
                break
            if 'perempuan' in text_lower:
                result['jenis_kelamin'] = 'PEREMPUAN'
                break

        # GOL DARAH: require a stand-alone token so letters inside the label
        # itself ("Gol", "Darah") are never taken as the blood type.
        blood_re = rf'(?<![A-Za-z])({self.BLOOD_TYPE_PATTERN})(?![A-Za-z])'
        for text in zone_texts.get('gol_darah', []):
            gol_match = re.search(blood_re, text, re.IGNORECASE)
            if gol_match:
                result['gol_darah'] = gol_match.group(1).upper()
                break

        # ALAMAT: skip the label line unless it is the only text in the zone
        alamat_texts = zone_texts.get('alamat', [])
        for text in alamat_texts:
            has_label = 'alamat' in text.lower()
            if not has_label or len(alamat_texts) == 1:
                val = self._extract_value_from_text(text) if has_label else text
                if val and 'alamat' not in val.lower():
                    result['alamat'] = val.upper()
                    break

        # PENERBITAN area (place and date share one zone): first date wins
        for text in zone_texts.get('penerbitan', []):
            date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
            if date_match and result.get('tanggal_penerbitan') is None:
                result['tanggal_penerbitan'] = date_match.group(1)

    def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
        """Extract KTP fields from OCR results using zones plus a text fallback.

        Args:
            ocr_results: List of dicts. Each is read for 'text' and, when
                present, 'bbox', 'x_center' and 'y_center'.
                (presumably the output of OCREngine.extract_text() - confirm)

        Returns:
            Dict with one key per KTP field; a field that was not found is None.

        Note:
            Zone assignments and the detected NIK are printed to stdout.
        """
        result = {
            'nik': None,
            'nama': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'jenis_kelamin': None,
            'gol_darah': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'agama': None,
            'status_perkawinan': None,
            'pekerjaan': None,
            'kewarganegaraan': None,
            'berlaku_hingga': None,
            'provinsi': None,
            'kabupaten_kota': None,
            'tanggal_penerbitan': None,
        }

        # Detect image dimensions from bounding boxes
        img_width, img_height = self._detect_image_size(ocr_results)

        # Assign each OCR text to a zone (zone_name -> list of texts)
        zone_texts = {}
        for r in ocr_results:
            zone = self._get_zone(r.get('x_center', 0), r.get('y_center', 0), img_width, img_height)
            if zone:
                raw = r.get('text')
                zone_texts.setdefault(zone, []).append('' if raw is None else str(raw))

        # Debug: print zone assignments
        print("\n[DEBUG KTPExtractor] Zone assignments:")
        for zone, zone_items in zone_texts.items():
            print(f" {zone}: {zone_items}")

        # Extract fields using the zone-based approach
        self._extract_by_zones(zone_texts, result)

        # Collect all texts for the fallback pattern matching
        texts = [str(r.get('text') or '').strip() for r in ocr_results]
        all_text = '\n'.join(texts)

        # NIK (16 digits) may appear anywhere
        nik_match = re.search(r'\b(\d{16})\b', all_text)
        if nik_match:
            result['nik'] = nik_match.group(1)
            print(f" -> NIK found: {result['nik']}")

        # Fallback: parse line by line for fields not found by zone
        for i, text in enumerate(texts):
            text_lower = text.lower()
            # Normalize full-width colons so one set of patterns covers both
            text_normalized = re.sub(self.COLON_PATTERN, ':', text)
            next_text = texts[i + 1] if i + 1 < len(texts) else None
            next_lower = next_text.lower() if next_text is not None else ''

            # ===== PROVINSI =====
            if 'provinsi' in text_lower and result['provinsi'] is None:
                val = self._extract_after_label(text_normalized, 'provinsi')
                if val:
                    result['provinsi'] = val.upper()
                elif next_text is not None and 'provinsi' not in next_lower:
                    # The value may be on the next line
                    result['provinsi'] = next_text.upper()

            # ===== KABUPATEN/KOTA =====
            if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) \
                    and result['kabupaten_kota'] is None:
                if 'provinsi' not in text_lower:  # not part of the provinsi line
                    val = self._extract_after_label(text_normalized, 'kabupaten|kota')
                    result['kabupaten_kota'] = val.upper() if val else text.upper()

            # ===== NAMA =====
            if 'nama' in text_lower and result['nama'] is None:
                val = self._extract_after_label(text_normalized, 'nama')
                if val and len(val) > 2:
                    result['nama'] = val.upper()
                elif next_text is not None:
                    # Name on the next line
                    if len(next_text) > 2 and not any(kw in next_lower for kw in ['tempat', 'lahir', 'jenis']):
                        result['nama'] = next_text.upper()

            # ===== TEMPAT/TANGGAL LAHIR =====
            # Matches "Tempat/Tgl Lahir", "Tempat Lahir" and similar labels
            if 'tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower):
                if result['tempat_lahir'] is None or result['tanggal_lahir'] is None:
                    ttl = self._extract_after_label(
                        text_normalized, r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
                    if not ttl and ':' in text_normalized:
                        # Value after the colon that the label patterns missed
                        ttl = text_normalized.split(':', 1)[1].strip()
                    if ttl:
                        self._parse_ttl(ttl, result)
                    elif next_text is not None:
                        # Label-only line (with or without colon): value on next line
                        if not any(kw in next_lower for kw in ['jenis', 'kelamin', 'alamat', 'gol']):
                            self._parse_ttl(next_text, result)

            # ===== JENIS KELAMIN =====
            # Female keywords first: 'female' contains the male keyword 'male'.
            if result['jenis_kelamin'] is None:
                if any(kw in text_lower for kw in self.FEMALE_KEYWORDS):
                    result['jenis_kelamin'] = 'PEREMPUAN'
                elif any(kw in text_lower for kw in self.MALE_KEYWORDS):
                    result['jenis_kelamin'] = 'LAKI-LAKI'

            # ===== GOLONGAN DARAH =====
            if ('darah' in text_lower or 'gol.' in text_lower) and result['gol_darah'] is None:
                # Same-line value; the lookahead stops "Golongan" yielding "O"
                gol_match = re.search(
                    rf'(?:gol|darah)[.\s:]*({self.BLOOD_TYPE_PATTERN})(?![A-Za-z])',
                    text_normalized, re.IGNORECASE)
                if gol_match:
                    result['gol_darah'] = gol_match.group(1).upper()
                elif next_text is not None and re.fullmatch(self.BLOOD_TYPE_PATTERN, next_text, re.IGNORECASE):
                    # Blood type on the next line
                    result['gol_darah'] = next_text.upper()
            # Stand-alone blood type (e.g. just "O" or "A+" on its own line)
            if result['gol_darah'] is None and re.fullmatch(self.BLOOD_TYPE_PATTERN, text, re.IGNORECASE):
                result['gol_darah'] = text.upper()

            # ===== ALAMAT =====
            if 'alamat' in text_lower and result['alamat'] is None:
                val = self._extract_after_label(text_normalized, 'alamat')
                if val:
                    result['alamat'] = val.upper()
                elif next_text is not None:
                    result['alamat'] = next_text.upper()

            # ===== RT/RW ===== (the last matching line wins)
            rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # ===== KELURAHAN/DESA =====
            if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
                if result['kel_desa'] is None:
                    val = self._extract_after_label(text_normalized, 'kel|desa')
                    if val:
                        result['kel_desa'] = val.upper()
                    elif next_text is not None:
                        result['kel_desa'] = next_text.upper()

            # ===== KECAMATAN =====
            if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
                if result['kecamatan'] is None:
                    val = self._extract_after_label(text_normalized, 'kecamatan|kec')
                    if val:
                        result['kecamatan'] = val.upper()
                    elif next_text is not None:
                        # Value on the next line
                        if len(next_text) > 2 and not any(kw in next_lower for kw in ['agama', 'status', 'pekerjaan']):
                            result['kecamatan'] = next_text.upper()

            # ===== AGAMA =====
            if 'agama' in text_lower:
                if result['agama'] is None:
                    val = self._extract_after_label(text_normalized, 'agama')
                    if val:
                        result['agama'] = val.upper()
                    elif next_text is not None and next_lower in self.AGAMA_LIST:
                        # Value on the next line
                        result['agama'] = next_text.upper()
            else:
                # Short line that contains a religion name without a label
                if len(text) < 20 and any(agama in text_lower for agama in self.AGAMA_LIST):
                    if result['agama'] is None:
                        result['agama'] = text.upper()

            # ===== STATUS PERKAWINAN =====
            if result['status_perkawinan'] is None and ('kawin' in text_lower or 'cerai' in text_lower):
                val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
                if val:
                    result['status_perkawinan'] = val.upper()
                else:
                    status = self._classify_marital_status(text_lower)
                    if status is None and 'perkawinan' in text_lower and next_text is not None:
                        # Label-only line: the value is on the next line
                        status = self._classify_marital_status(next_lower)
                    if status:
                        result['status_perkawinan'] = status

            # ===== PEKERJAAN =====
            if 'pekerjaan' in text_lower:
                if result['pekerjaan'] is None:
                    val = self._extract_after_label(text_normalized, 'pekerjaan')
                    if val:
                        result['pekerjaan'] = val.upper()
                    elif next_text is not None:
                        # Value on the next line
                        if len(next_text) > 2 and 'kewarganegaraan' not in next_lower:
                            result['pekerjaan'] = next_text.upper()
            else:
                # Short line that contains a known occupation keyword
                if len(text) < 30 and any(job in text_lower for job in self.PEKERJAAN_LIST):
                    if result['pekerjaan'] is None:
                        result['pekerjaan'] = text.upper()

            # ===== KEWARGANEGARAAN =====
            if 'wni' in text_lower:
                result['kewarganegaraan'] = 'WNI'
            elif 'wna' in text_lower:
                result['kewarganegaraan'] = 'WNA'
            elif 'warga' in text_lower and result['kewarganegaraan'] is None:
                val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
                if val:
                    result['kewarganegaraan'] = val.upper()

            # ===== BERLAKU HINGGA =====
            if 'berlaku' in text_lower or 'seumur' in text_lower:
                if result['berlaku_hingga'] is None:
                    if 'seumur' in text_lower or 'hidup' in text_lower:
                        result['berlaku_hingga'] = 'SEUMUR HIDUP'
                    else:
                        val = self._extract_after_label(text_normalized, 'berlaku')
                        if val:
                            result['berlaku_hingga'] = val.upper()

            # ===== TANGGAL PENERBITAN (usually DD-MM-YYYY near the end) =====
            # Skip the "Berlaku Hingga" line: its date is the expiry date.
            if result['tanggal_penerbitan'] is None and 'berlaku' not in text_lower:
                date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text)
                if date_match:
                    found_date = date_match.group(1)
                    # Must differ from the birth date, and appear either after
                    # berlaku_hingga was found or in the last 30% of the lines.
                    if result['tanggal_lahir'] != found_date:
                        if result['berlaku_hingga'] or i > len(texts) * 0.7:
                            result['tanggal_penerbitan'] = found_date

        # Post-processing
        result = self._post_process(result)
        return result

    def _classify_marital_status(self, text_lower: str) -> Optional[str]:
        """Map a lowercase line to a canonical marital status, or None.

        A bare 'kawin' must not be part of a longer word, so the label
        'perkawinan' on its own is not classified as KAWIN.
        """
        if 'cerai hidup' in text_lower:
            return 'CERAI HIDUP'
        if 'cerai mati' in text_lower:
            return 'CERAI MATI'
        if 'belum' in text_lower and 'kawin' in text_lower:
            return 'BELUM KAWIN'
        if re.search(r'(?<![a-z])kawin(?![a-z])', text_lower):
            return 'KAWIN'
        return None

    def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
        """Return the value following a label, or None.

        ``label_pattern`` is a regex alternation for the label. Two shapes are
        tried, case-insensitively: ``label: value`` and ``label VALUE``.
        Values of one character or less are rejected.
        """
        # Accept full-width colons even when the caller did not normalize.
        text = re.sub(self.COLON_PATTERN, ':', text)
        patterns = [
            rf'(?:{label_pattern})\s*:\s*(.+)',       # label: value
            rf'(?:{label_pattern})\s+([A-Z0-9].+)',   # label VALUE
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                # Remove stray leading/trailing colons
                value = re.sub(r'^[:\s]+', '', value)
                value = re.sub(r'\s*:\s*$', '', value)
                if value and len(value) > 1:
                    return value
        return None

    def _parse_ttl(self, ttl_text: str, result: Dict):
        """Parse place and date of birth from ``ttl_text`` into ``result``.

        Sets 'tanggal_lahir' and/or 'tempat_lahir' in place. Accepts
        "PLACE, DD-MM-YYYY", "PLACE DD-MM-YYYY", merged "PLACEDD-MM-YYYY",
        dates with spaces instead of dashes and 8-digit dates.
        """
        ttl_text = ttl_text.strip()

        # Normalize dates where OCR missed dashes:
        #   "05 08 1978" / "05 08-1978" / "05-08 1978" -> "05-08-1978"
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)

        # 8-digit date without separator: "05081978" -> "05-08-1978".
        # The lookarounds keep this from firing inside longer digit runs.
        ttl_text = re.sub(r'(?<!\d)(\d{2})(\d{2})(\d{4})(?!\d)', r'\1-\2-\3', ttl_text)

        # Merged city+date such as "JAKARTA05-08-1978": add a space
        ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)

        date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
        if date_match:
            result['tanggal_lahir'] = date_match.group(1)
            # The place is whatever precedes the date
            place = ttl_text[:date_match.start()].strip(' ,:-/' + self.FULLWIDTH_COLON)
            # Drop leading separators and any run of label remnants
            place = re.sub(
                r'^[/\s:\uFF1A]*(?:(?:tempat|tgl|lahir)[/\s.:\uFF1A]*)*',
                '', place, flags=re.IGNORECASE).strip()
            if place and len(place) > 2:
                result['tempat_lahir'] = place.upper()
        else:
            # No recognisable date: try "PLACE, DATE" split by comma
            parts = ttl_text.split(',')
            if len(parts) >= 2:
                place = parts[0].strip()
                date_part = parts[1].strip()
                if place:
                    result['tempat_lahir'] = place.upper()
                if date_part:
                    result['tanggal_lahir'] = date_part
            elif len(ttl_text) > 2:
                result['tempat_lahir'] = ttl_text.upper()

    def _post_process(self, result: Dict) -> Dict:
        """Clean up extracted values in place and return ``result``.

        Validates the NIK, trims colons/whitespace, strips label remnants and
        re-inserts spaces into commonly merged region and address words.
        """
        # NIK must be exactly 16 digits
        nik = result.get('nik')
        if nik and not re.match(r'^\d{16}$', nik):
            cleaned = re.sub(r'\D', '', nik)
            result['nik'] = cleaned if len(cleaned) == 16 else None

        # Strip leading/trailing colons (standard and full-width), collapse spaces
        for field in result:
            val = result[field]
            if val and isinstance(val, str):
                val = re.sub(r'^[\s:\uFF1A]+', '', val)
                val = re.sub(r'[\s:\uFF1A]+$', '', val)
                val = re.sub(r'\s+', ' ', val)
                result[field] = val.strip()

        # Remove captured labels. The lookahead requires the label to end
        # there, so real values such as KELVIN or KELAPA GADING are kept.
        for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
            if result.get(field):
                result[field] = re.sub(
                    r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KELURAHAN|KEL|DESA|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)'
                    r'(?![A-Z])[\s:]*',
                    '', result[field], flags=re.IGNORECASE
                ).strip()

        # Status perkawinan that still carries its label
        if result.get('status_perkawinan'):
            result['status_perkawinan'] = re.sub(
                r'^(?:(?:STATUS|PERKAWINAN)(?![A-Z])[\s:]*)+', '',
                result['status_perkawinan'], flags=re.IGNORECASE).strip()

        # Berlaku hingga
        if result.get('berlaku_hingga'):
            bh = re.sub(r'^(?:(?:BERLAKU|HINGGA)(?![A-Z])[\s:]*)+', '',
                        result['berlaku_hingga'], flags=re.IGNORECASE).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh

        # Merged kabupaten/kota names (e.g. JAKARTASELATAN -> JAKARTA SELATAN)
        if result.get('kabupaten_kota'):
            kk = result['kabupaten_kota']
            kk = re.sub(
                r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)'
                r'(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                r'\1 \2', kk, flags=re.IGNORECASE)
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()

        # Merged provinsi names
        if result.get('provinsi'):
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(
                r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()

        # Merged address (e.g. JLKECAPI -> JL KECAPI)
        if result.get('alamat'):
            alamat = result['alamat']
            prefixes = self.STREET_PREFIX_PATTERN
            # Space after a street prefix glued to the next word. The negative
            # lookahead leaves text alone when it already starts with a
            # complete prefix (so "JLN MAWAR" is not turned into "JL N MAWAR").
            alamat = re.sub(
                rf'^(?!(?:{prefixes})(?![A-Z]))({prefixes})([A-Z])',
                r'\1 \2', alamat, flags=re.IGNORECASE)
            # Space before a trailing multi-letter Roman numeral. A single
            # trailing I/V/X is left alone: it is far more often the last
            # letter of a word (MELATI) than a merged numeral.
            alamat = re.sub(r'([A-HJ-UWYZ])([IVX]{2,})$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Space before a trailing number
            alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
            alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
            result['alamat'] = alamat.upper()

        return result
if __name__ == "__main__":
    # Manual smoke test. The sample entries carry no 'bbox', 'x_center' or
    # 'y_center', so only the line-by-line fallback path is exercised.
    sample_ocr = [
        {'text': 'PROVINSI JAWA BARAT'},
        {'text': 'KABUPATEN BANDUNG'},
        {'text': 'NIK : 3204012345678901'},
        {'text': 'Nama : JOHN DOE'},
        {'text': 'Tempat/Tgl Lahir : BANDUNG, 01-01-1990'},
        {'text': 'Jenis Kelamin : LAKI-LAKI'},
        {'text': 'Alamat : JL. MERDEKA NO. 123'},
        {'text': 'RT/RW : 001/002'},
        {'text': 'Kel/Desa : SUKAMAJU'},
        {'text': 'Kecamatan : SUKASARI'},
        {'text': 'Agama : ISLAM'},
        {'text': 'Status Perkawinan : BELUM KAWIN'},
        {'text': 'Pekerjaan : KARYAWAN SWASTA'},
        {'text': 'Kewarganegaraan : WNI'},
        {'text': 'Berlaku Hingga : SEUMUR HIDUP'},
    ]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    # Print every extracted field, one per line.
    for key, value in result.items():
        print(f"{key}: {value}")