OCR dengan ZONA
This commit is contained in:
602
ktp_extractor.py
Normal file
602
ktp_extractor.py
Normal file
@@ -0,0 +1,602 @@
|
||||
"""
|
||||
KTP Field Extractor
|
||||
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
|
||||
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
|
||||
"""
|
||||
|
||||
import logging
import re
from typing import Dict, Optional, List
|
||||
|
||||
|
||||
class KTPExtractor:
    """Extract structured fields from the OCR output of an Indonesian KTP.

    Extraction runs in two passes:

    1. Zone pass - every OCR text whose centre point falls inside one of the
       normalized rectangles in ``ZONES`` is interpreted as that zone's field.
    2. Line pass - every field still missing is searched for line by line
       using label keywords and regular expressions.

    Labels may be separated from values by an ASCII colon, a full-width colon
    (U+FF1A) or plain whitespace.
    """

    # Separators accepted between label and value: ASCII and full-width colon.
    COLON_PATTERN = r'[:\uFF1A]'

    # Gender keywords (substring match, lower-case)
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Recognised religions
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupations
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max).
    # Zones overlap on purpose; the first matching entry wins (dict order).
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Nama area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Tempat/Tgl Lahir
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Jenis Kelamin (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Gol Darah (right of jenis)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Alamat
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Kel/Desa
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # Kecamatan
        'agama': (0.02, 0.63, 0.70, 0.72),             # Agama
        'status': (0.02, 0.69, 0.70, 0.78),            # Status Perkawinan
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Pekerjaan
        'wni': (0.02, 0.81, 0.70, 0.90),               # Kewarganegaraan
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Berlaku Hingga
        'foto': (0.68, 0.10, 0.98, 0.55),              # Foto (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Place & date of issue
    }

    # ----- private helpers (compiled once, shared by all instances) -----
    _logger = logging.getLogger(__name__)
    _COLON_RE = re.compile(COLON_PATTERN)
    # "nama" only counts as a label at the start of a word, so names such as
    # PURNAMA are not mistaken for the label line.
    _NAMA_LABEL_RE = re.compile(r'\bnama', re.IGNORECASE)
    _KEL_TRIGGER_RE = re.compile(r'\bkel(?:urahan)?\b|\bkel[/\s]*desa|\bdesa\b', re.IGNORECASE)
    _KEC_TRIGGER_RE = re.compile(r'\bkecamatan|\bkec\b', re.IGNORECASE)
    _BLOOD_LABEL_RE = re.compile(r'gol(?:ongan)?\.?|darah', re.IGNORECASE)
    # A blood group must stand alone (not be part of a longer word).
    _BLOOD_TYPE_RE = re.compile(r'(?<![A-Za-z])(AB|A|B|O)([+\-]?)(?![A-Za-z])', re.IGNORECASE)
    _BLOOD_ONLY_RE = re.compile(r'(?:AB|A|B|O)[+\-]?', re.IGNORECASE)
    # Longest labels first, and \b so that values like "KELAPA GADING" survive.
    _FIELD_LABEL_RE = re.compile(
        r'^(?:KELURAHAN|KECAMATAN|PEKERJAAN|ALAMAT|TEMPAT|STATUS|LAHIR|AGAMA|NAMA|DESA|TGL|KEL|KEC)'
        r'\b[\s:\uFF1A]*',
        re.IGNORECASE,
    )
    _STATUS_LABEL_RE = re.compile(r'^(?:(?:STATUS|PERKAWINAN)\b[\s:\uFF1A]*)+', re.IGNORECASE)
    _BERLAKU_LABEL_RE = re.compile(r'^(?:(?:BERLAKU|HINGGA)\b[\s:\uFF1A]*)+', re.IGNORECASE)
    # Street prefixes, tried in this order when splitting an OCR-merged word.
    _STREET_PREFIXES = ('JL', 'JLN', 'JALAN', 'GG', 'GANG', 'NO', 'BLOK')
    # First words that are already complete and must never be split.
    _STREET_WORDS = frozenset(_STREET_PREFIXES + ('NOMOR',))

    def __init__(self):
        # Image size estimated by the most recent extract() call.
        self.image_width = 0
        self.image_height = 0

    def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
        """Return the name of the first zone containing the point, else None.

        Args:
            x_center, y_center: centre of the OCR box, in pixels.
            img_width, img_height: image size used to normalize the point.
        """
        if img_width == 0 or img_height == 0:
            return None

        x_norm = x_center / img_width
        y_norm = y_center / img_height

        for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
            if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
                return zone_name
        return None

    def _extract_value_from_text(self, text: str) -> str:
        """Return the part after the first colon, or the whole text if none.

        Both the ASCII colon and the full-width colon are treated as separator.
        """
        parts = self._COLON_RE.split(text, 1)
        if len(parts) > 1:
            return parts[1].strip()
        return text.strip()

    def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
        """Estimate (width, height) from the largest bbox coordinates seen.

        Each ``bbox`` is read as a sequence of at least four (x, y) points.
        A 5% margin is added; (1000, 640) is returned when no box is usable.
        """
        max_x, max_y = 0, 0
        for r in ocr_results:
            bbox = r.get('bbox') or []
            if len(bbox) >= 4:
                for point in bbox:
                    if len(point) >= 2:
                        max_x = max(max_x, point[0])
                        max_y = max(max_y, point[1])
        return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)

    def _find_blood_type(self, text: str) -> Optional[str]:
        """Find a stand-alone blood group (A, B, AB, O, optional +/-) in text.

        Label words ("Gol.", "Golongan", "Darah") are removed first so their
        own letters can never be reported as the blood group.
        """
        without_label = self._BLOOD_LABEL_RE.sub(' ', text)
        match = self._BLOOD_TYPE_RE.search(without_label)
        if match:
            return (match.group(1) + match.group(2)).upper()
        return None

    @staticmethod
    def _match_marital_status(text_lower: str) -> Optional[str]:
        """Map a lower-cased line to a canonical marital status, if any.

        The label words are blanked first, so a label-only line
        ("status perkawinan") yields None instead of KAWIN.
        """
        candidate = re.sub(r'status|perkawinan', ' ', text_lower)
        if 'belum' in candidate:
            return 'BELUM KAWIN'
        if 'cerai hidup' in candidate:
            return 'CERAI HIDUP'
        if 'cerai mati' in candidate:
            return 'CERAI MATI'
        if 'kawin' in candidate:
            return 'KAWIN'
        return None

    @classmethod
    def _split_street_prefix(cls, alamat: str) -> str:
        """Insert a space after a street prefix glued to the next word.

        "JLKECAPI" -> "JL KECAPI"; complete words such as "JLN", "JALAN",
        "GANG" or "NOMOR" are left untouched.
        """
        head = re.match(r'[A-Za-z]+', alamat)
        if not head:
            return alamat
        word = head.group(0).upper()
        if word in cls._STREET_WORDS:
            return alamat
        for prefix in cls._STREET_PREFIXES:
            if word.startswith(prefix) and len(word) > len(prefix):
                return alamat[:len(prefix)] + ' ' + alamat[len(prefix):]
        return alamat

    def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
        """Fill ``result`` in place from texts that were assigned to zones.

        Args:
            zone_texts: zone name -> OCR texts found in that zone, in OCR order.
            result: field dict to update; only matched fields are written.
        """
        # PROVINSI from header
        for text in zone_texts.get('header_provinsi', []):
            if 'provinsi' in text.lower():
                # count=1: strip the label only, not later occurrences
                val = re.sub(r'(?i)provinsi\s*', '', text, count=1).strip()
                if val:
                    result['provinsi'] = val.upper()
                break

        # KABUPATEN/KOTA from header
        for text in zone_texts.get('header_kabupaten', []):
            text_lower = text.lower()
            if 'kabupaten' in text_lower or 'kota' in text_lower:
                val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text, count=1).strip()
                result['kabupaten_kota'] = val.upper() if val else text.upper()
                break

        # NAMA: first non-label text, or the value that follows the label
        for text in zone_texts.get('nama', []):
            if not self._NAMA_LABEL_RE.search(text):
                if len(text) > 2:
                    result['nama'] = text.upper()
                    break
            else:
                val = self._extract_value_from_text(text)
                if val and not self._NAMA_LABEL_RE.match(val):
                    result['nama'] = val.upper()
                    break

        # TTL (place / date of birth)
        for text in zone_texts.get('ttl', []):
            text_lower = text.lower()
            if 'tempat' in text_lower or 'lahir' in text_lower:
                val = self._extract_value_from_text(text)
                if val:
                    self._parse_ttl(val, result)
                break

        # JENIS KELAMIN
        for text in zone_texts.get('jenis_kelamin', []):
            text_lower = text.lower()
            if 'perempuan' in text_lower:
                result['jenis_kelamin'] = 'PEREMPUAN'
                break
            if 'laki' in text_lower:
                result['jenis_kelamin'] = 'LAKI-LAKI'
                break

        # GOL DARAH
        for text in zone_texts.get('gol_darah', []):
            blood = self._find_blood_type(text)
            if blood:
                result['gol_darah'] = blood
                break

        # ALAMAT: skip the label line unless it is the only text in the zone
        alamat_texts = zone_texts.get('alamat', [])
        for text in alamat_texts:
            has_label = 'alamat' in text.lower()
            if not has_label or len(alamat_texts) == 1:
                val = self._extract_value_from_text(text) if has_label else text
                if val and 'alamat' not in val.lower():
                    result['alamat'] = val.upper()
                    break

        # PENERBITAN: the issue date shares a zone with the place of issue
        for text in zone_texts.get('penerbitan', []):
            date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
            if date_match and result.get('tanggal_penerbitan') is None:
                result['tanggal_penerbitan'] = date_match.group(1)

    def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
        """Extract KTP fields from OCR results.

        Args:
            ocr_results: list of dicts. ``text`` is the recognised string;
                ``x_center`` / ``y_center`` and ``bbox`` are optional and only
                used for zone detection. Entries without text are ignored.

        Returns:
            Dict with one key per KTP field; a field that could not be
            found is None.
        """
        ocr_results = ocr_results or []
        result: Dict[str, Optional[str]] = {
            'nik': None,
            'nama': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'jenis_kelamin': None,
            'gol_darah': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'agama': None,
            'status_perkawinan': None,
            'pekerjaan': None,
            'kewarganegaraan': None,
            'berlaku_hingga': None,
            'provinsi': None,
            'kabupaten_kota': None,
            'tanggal_penerbitan': None,
        }

        img_width, img_height = self._detect_image_size(ocr_results)
        self.image_width, self.image_height = img_width, img_height

        # Assign each OCR text to a zone and collect the stripped lines.
        zone_texts: Dict[str, List[str]] = {}
        texts: List[str] = []
        for r in ocr_results:
            raw = r.get('text')
            if raw is None or not str(raw).strip():
                continue
            raw = str(raw)
            texts.append(raw.strip())
            zone = self._get_zone(r.get('x_center', 0), r.get('y_center', 0), img_width, img_height)
            if zone:
                zone_texts.setdefault(zone, []).append(raw)

        self._logger.debug("Zone assignments: %s", zone_texts)

        self._extract_by_zones(zone_texts, result)

        all_text = '\n'.join(texts)

        # NIK (16 digits) may appear anywhere. The number is not logged.
        nik_match = re.search(r'\b(\d{16})\b', all_text)
        if nik_match:
            result['nik'] = nik_match.group(1)
            self._logger.debug("NIK found")

        # Fallback: line-by-line search for fields the zone pass did not fill.
        for i, text in enumerate(texts):
            text_lower = text.lower()
            text_normalized = self._COLON_RE.sub(':', text)
            next_text = texts[i + 1] if i + 1 < len(texts) else None

            # ===== PROVINSI =====
            if 'provinsi' in text_lower and result['provinsi'] is None:
                val = self._extract_after_label(text_normalized, 'provinsi')
                if val:
                    result['provinsi'] = val.upper()
                elif next_text is not None and 'provinsi' not in next_text.lower():
                    # value is probably on the next line
                    result['provinsi'] = next_text.upper()

            # ===== KABUPATEN/KOTA =====
            if (('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower)
                    and result['kabupaten_kota'] is None):
                if 'provinsi' not in text_lower:  # not part of the province line
                    val = self._extract_after_label(text_normalized, 'kabupaten|kota')
                    result['kabupaten_kota'] = val.upper() if val else text.upper()

            # ===== NAMA =====
            if result['nama'] is None and self._NAMA_LABEL_RE.search(text):
                val = self._extract_after_label(text_normalized, r'\bnama')
                if val and len(val) > 2:
                    result['nama'] = val.upper()
                elif next_text is not None:
                    if len(next_text) > 2 and not any(
                            kw in next_text.lower() for kw in ('tempat', 'lahir', 'jenis')):
                        result['nama'] = next_text.upper()

            # ===== TEMPAT/TANGGAL LAHIR =====
            if 'tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower):
                if result['tempat_lahir'] is None or result['tanggal_lahir'] is None:
                    ttl = self._extract_after_label(
                        text_normalized,
                        r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
                    if not ttl and ':' in text_normalized:
                        ttl = text_normalized.split(':', 1)[1].strip()
                    if ttl:
                        self._parse_ttl(ttl, result)
                    elif next_text is not None and not any(
                            kw in next_text.lower() for kw in ('jenis', 'kelamin', 'alamat', 'gol')):
                        # label (possibly ending in a colon) with the value on the next line
                        self._parse_ttl(next_text, result)

            # ===== JENIS KELAMIN =====
            # Female first: 'female' contains the male keyword 'male'.
            if result['jenis_kelamin'] is None:
                if any(kw in text_lower for kw in self.FEMALE_KEYWORDS):
                    result['jenis_kelamin'] = 'PEREMPUAN'
                elif any(kw in text_lower for kw in self.MALE_KEYWORDS):
                    result['jenis_kelamin'] = 'LAKI-LAKI'

            # ===== GOLONGAN DARAH =====
            if result['gol_darah'] is None:
                if 'darah' in text_lower or 'gol.' in text_lower:
                    blood = self._find_blood_type(text)
                    if blood:
                        result['gol_darah'] = blood
                    elif next_text is not None and self._BLOOD_ONLY_RE.fullmatch(next_text):
                        # value on the next line
                        result['gol_darah'] = next_text.upper()
                # stand-alone blood group (e.g. just "O" or "A+")
                if result['gol_darah'] is None and self._BLOOD_ONLY_RE.fullmatch(text):
                    result['gol_darah'] = text.upper()

            # ===== ALAMAT =====
            if 'alamat' in text_lower and result['alamat'] is None:
                val = self._extract_after_label(text_normalized, 'alamat')
                if val:
                    result['alamat'] = val.upper()
                elif next_text is not None:
                    result['alamat'] = next_text.upper()

            # ===== RT/RW =====
            rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # ===== KELURAHAN/DESA =====
            # Whole-word trigger so "JL KELAPA" is not taken for the label.
            if (result['kel_desa'] is None and 'kelamin' not in text_lower
                    and self._KEL_TRIGGER_RE.search(text)):
                val = self._extract_after_label(text_normalized, 'kelurahan|kel|desa')
                if val:
                    result['kel_desa'] = val.upper()
                elif next_text is not None:
                    result['kel_desa'] = next_text.upper()

            # ===== KECAMATAN =====
            # Whole-word trigger so "JL KECAPI" is not taken for the label.
            if result['kecamatan'] is None and self._KEC_TRIGGER_RE.search(text):
                val = self._extract_after_label(text_normalized, r'\bkecamatan|\bkec\b')
                if val:
                    result['kecamatan'] = val.upper()
                elif next_text is not None:
                    if len(next_text) > 2 and not any(
                            kw in next_text.lower() for kw in ('agama', 'status', 'pekerjaan')):
                        result['kecamatan'] = next_text.upper()

            # ===== AGAMA =====
            if result['agama'] is None:
                if 'agama' in text_lower:
                    val = self._extract_after_label(text_normalized, 'agama')
                    if val:
                        result['agama'] = val.upper()
                    elif next_text is not None and next_text.lower() in self.AGAMA_LIST:
                        result['agama'] = next_text.upper()
                elif len(text) < 20 and any(agama in text_lower for agama in self.AGAMA_LIST):
                    # short line that holds just the religion
                    result['agama'] = text.upper()

            # ===== STATUS PERKAWINAN =====
            if result['status_perkawinan'] is None and ('kawin' in text_lower or 'cerai' in text_lower):
                val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
                if val:
                    result['status_perkawinan'] = val.upper()
                else:
                    status = self._match_marital_status(text_lower)
                    if status:
                        result['status_perkawinan'] = status

            # ===== PEKERJAAN =====
            if result['pekerjaan'] is None:
                if 'pekerjaan' in text_lower:
                    val = self._extract_after_label(text_normalized, 'pekerjaan')
                    if val:
                        result['pekerjaan'] = val.upper()
                    elif next_text is not None:
                        if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
                            result['pekerjaan'] = next_text.upper()
                elif len(text) < 30 and any(job in text_lower for job in self.PEKERJAAN_LIST):
                    # short line that holds an occupation keyword
                    result['pekerjaan'] = text.upper()

            # ===== KEWARGANEGARAAN =====
            if 'wni' in text_lower:
                result['kewarganegaraan'] = 'WNI'
            elif 'wna' in text_lower:
                result['kewarganegaraan'] = 'WNA'
            elif 'warga' in text_lower and result['kewarganegaraan'] is None:
                val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
                if val:
                    result['kewarganegaraan'] = val.upper()

            # ===== BERLAKU HINGGA =====
            if ('berlaku' in text_lower or 'seumur' in text_lower) and result['berlaku_hingga'] is None:
                if 'seumur' in text_lower or 'hidup' in text_lower:
                    result['berlaku_hingga'] = 'SEUMUR HIDUP'
                else:
                    val = self._extract_after_label(text_normalized, 'berlaku')
                    if val:
                        # "Berlaku Hingga" alone must not store "HINGGA"
                        val = re.sub(r'^hingga\b[\s:]*', '', val, flags=re.IGNORECASE).strip()
                        if val:
                            result['berlaku_hingga'] = val.upper()

            # ===== TANGGAL PENERBITAN (usually DD-MM-YYYY near the end) =====
            if result['tanggal_penerbitan'] is None:
                date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text)
                if date_match:
                    found_date = date_match.group(1)
                    # must differ from the birth date, and appear late on the card
                    if result['tanggal_lahir'] != found_date:
                        if result['berlaku_hingga'] or i > len(texts) * 0.7:
                            result['tanggal_penerbitan'] = found_date

        return self._post_process(result)

    def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
        """Return the value that follows a label, or None.

        Args:
            text: one OCR line.
            label_pattern: regex alternation describing the label.

        Two shapes are tried in order: ``label : value`` and ``label value``
        (value starting with a letter or digit). Matching is case-insensitive
        and values shorter than two characters are rejected.
        """
        text = self._COLON_RE.sub(':', text)
        patterns = (
            rf'(?:{label_pattern})\s*:\s*(.+)',       # label: value
            rf'(?:{label_pattern})\s+([A-Z0-9].+)',   # label VALUE
        )
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            value = match.group(1).strip()
            # drop stray separators left around the value
            value = re.sub(r'^[:\s]+', '', value)
            value = re.sub(r'\s*:\s*$', '', value)
            if len(value) > 1:
                return value
        return None

    def _parse_ttl(self, ttl_text: str, result: Dict):
        """Split a place/date-of-birth string into ``result`` in place.

        Writes ``tanggal_lahir`` (DD-MM-YYYY as found) and ``tempat_lahir``.
        """
        ttl_text = ttl_text.strip()

        # Repair dates where OCR dropped the dashes:
        # "05 08 1978", "05 08-1978", "05-08 1978" -> "05-08-1978"
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)

        # 8-digit date without separators: "05081978" -> "05-08-1978"
        date_8digit = re.search(r'(\d{8})', ttl_text)
        if date_8digit:
            d = date_8digit.group(1)
            ttl_text = ttl_text.replace(d, f"{d[:2]}-{d[2:4]}-{d[4:]}")

        # City glued to the date ("JAKARTA05-08-1978"): add a space
        ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)

        # "PLACE, DD-MM-YYYY" or "PLACE DD-MM-YYYY"
        date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
        if date_match:
            result['tanggal_lahir'] = date_match.group(1)
            # the place is whatever precedes the date
            place = ttl_text[:date_match.start()].strip(' ,:-/')
            # remove label remnants
            place = re.sub(r'^(?:tempat|tgl|lahir|[:\uFF1A])[/\s:\uFF1A]*', '', place,
                           flags=re.IGNORECASE).strip()
            if place and len(place) > 2:
                result['tempat_lahir'] = place.upper()
        else:
            # no recognisable date: try "place, something"
            parts = ttl_text.split(',')
            if len(parts) >= 2:
                result['tempat_lahir'] = parts[0].strip().upper()
                result['tanggal_lahir'] = parts[1].strip()
            elif len(ttl_text) > 2:
                result['tempat_lahir'] = ttl_text.upper()

    def _post_process(self, result: Dict) -> Dict:
        """Validate and tidy extracted values; returns the same dict.

        Strips separators and captured labels, repairs some OCR-merged words
        and turns values that end up empty into None.
        """
        # NIK must be exactly 16 digits
        nik = result['nik']
        if nik and not re.match(r'^\d{16}$', nik):
            cleaned = re.sub(r'\D', '', nik)
            result['nik'] = cleaned if len(cleaned) == 16 else None

        # Strip leading/trailing colons (both kinds) and collapse whitespace
        for field in list(result):
            val = result[field]
            if val and isinstance(val, str):
                val = re.sub(r'^[\s:\uFF1A]+', '', val)
                val = re.sub(r'[\s:\uFF1A]+$', '', val)
                result[field] = re.sub(r'\s+', ' ', val).strip()

        # Remove a label captured in front of the value
        for field in ('nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan'):
            if result[field]:
                result[field] = self._FIELD_LABEL_RE.sub('', result[field]).strip()

        if result['status_perkawinan']:
            result['status_perkawinan'] = self._STATUS_LABEL_RE.sub(
                '', result['status_perkawinan']).strip()

        if result['berlaku_hingga']:
            bh = self._BERLAKU_LABEL_RE.sub('', result['berlaku_hingga']).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh

        # Merged kabupaten/kota names (JAKARTASELATAN -> JAKARTA SELATAN)
        if result['kabupaten_kota']:
            kk = result['kabupaten_kota']
            kk = re.sub(
                r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)'
                r'(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
                r'\1 \2', kk, flags=re.IGNORECASE)
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()

        # Merged province names
        if result['provinsi']:
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(
                r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
                r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()

        # Merged address words
        if result['alamat']:
            alamat = result['alamat']
            # No whitespace at all means OCR dropped the spaces.
            merged = re.search(r'\s', alamat) is None
            alamat = self._split_street_prefix(alamat)
            if merged:
                # Trailing Roman numeral. Only attempted on merged input,
                # otherwise "JL KENARI" would become "JL KENAR I".
                alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Trailing number glued to a word
            alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
            alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
            result['alamat'] = alamat.upper()

        # A value emptied by the cleanup above counts as "not found"
        for field in list(result):
            val = result[field]
            if isinstance(val, str) and not val.strip():
                result[field] = None

        return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: run the extractor over a hand-written, text-only KTP sample
    # (no coordinates) and print every extracted field.
    demo_lines = (
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    )
    sample_ocr = [{'text': line} for line in demo_lines]

    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)

    for field_name in result:
        print(f"{field_name}: {result[field_name]}")
|
||||
Reference in New Issue
Block a user