236 lines
8.5 KiB
Python
236 lines
8.5 KiB
Python
"""
|
|
KK (Kartu Keluarga) Field Extractor
|
|
Ekstraksi data terstruktur dari hasil OCR KK Indonesia
|
|
"""
|
|
|
|
import re
|
|
from typing import Dict, Optional, List
|
|
|
|
|
|
class KKExtractor:
    """Extract structured fields from the OCR output of a Kartu Keluarga (KK).

    The extractor keeps no state between calls: :meth:`extract` works only on
    the OCR entries it is given.
    """

    # Cues that mark the header row of the family-member table. "no" and "nik"
    # are matched as whole words: as bare substrings they also occur inside
    # ordinary text such as "JL. TEKNIK NO. 5", which is not a table header.
    _HEADER_PATTERNS = (
        re.compile(r'\bno\b', re.IGNORECASE),
        re.compile(r'nama lengkap', re.IGNORECASE),
        re.compile(r'\bnik\b', re.IGNORECASE),
        re.compile(r'jenis kelamin', re.IGNORECASE),
        re.compile(r'hubungan', re.IGNORECASE),
    )

    # A line following the "Kepala Keluarga" label is NOT a name when it is
    # itself another label. RT/RW are matched as tokens (start of a word, not
    # followed by a letter) so that names such as "HARTONO" are not rejected.
    _NEXT_LABEL_RE = re.compile(r'alamat|provinsi|\br[tw](?![a-z])', re.IGNORECASE)

    # Keyword -> normalized relationship; checked in this order, first hit wins.
    _HUBUNGAN_KEYWORDS = (
        ('kepala', 'KEPALA KELUARGA'),
        ('istri', 'ISTRI'),
        ('suami', 'SUAMI'),
        ('anak', 'ANAK'),
        ('menantu', 'MENANTU'),
        ('cucu', 'CUCU'),
        ('orang tua', 'ORANG TUA'),
        ('mertua', 'MERTUA'),
    )

    def __init__(self):
        pass

    def extract(self, ocr_results: List[Dict]) -> Dict:
        """Extract KK fields from a list of OCR entries.

        Args:
            ocr_results: OCR entries in reading order. Each entry must have a
                ``'text'`` key; any other keys are ignored here.

        Returns:
            Dict with the keys ``no_kk``, ``nama_kepala_keluarga``, ``alamat``,
            ``rt_rw``, ``kel_desa``, ``kecamatan``, ``kabupaten_kota``,
            ``provinsi``, ``kode_pos`` (each a string or ``None``) and
            ``anggota_keluarga`` (a list of member dicts, possibly empty).

        Raises:
            KeyError: If an entry has no ``'text'`` key.
        """
        all_text = '\n'.join(r['text'] for r in ocr_results)

        result = {
            'no_kk': None,
            'nama_kepala_keluarga': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'kabupaten_kota': None,
            'provinsi': None,
            'kode_pos': None,
            'anggota_keluarga': [],
        }

        # Fallback KK number: the first standalone 16-digit run anywhere.
        # A labelled match found in the loop below overrides it.
        kk_match = re.search(r'\b(\d{16})\b', all_text)
        if kk_match:
            result['no_kk'] = kk_match.group(1)

        # Becomes True once the member-table header row has been seen.
        in_table = False
        total = len(ocr_results)

        for i, ocr in enumerate(ocr_results):
            text = ocr['text'].strip()
            text_lower = text.lower()
            # Raw text of the following entry; some values sit on the line
            # after their label.
            next_text = ocr_results[i + 1]['text'] if i + 1 < total else None

            # Province
            if 'provinsi' in text_lower and result['provinsi'] is None:
                result['provinsi'] = self._extract_value(text, 'provinsi')

            # Regency / city
            if (('kabupaten' in text_lower or 'kota' in text_lower)
                    and result['kabupaten_kota'] is None):
                val = (self._extract_value(text, 'kabupaten')
                       or self._extract_value(text, 'kota'))
                # Fall back to the whole line when no value follows the label.
                result['kabupaten_kota'] = val or text

            # District
            if 'kecamatan' in text_lower and result['kecamatan'] is None:
                result['kecamatan'] = self._extract_value(text, 'kecamatan')

            # Village / urban ward
            if (('kelurahan' in text_lower or 'desa' in text_lower)
                    and result['kel_desa'] is None):
                result['kel_desa'] = (self._extract_value(text, 'kelurahan')
                                      or self._extract_value(text, 'desa'))

            # Labelled KK number: 16 digits on this line, else on the next one.
            if 'no' in text_lower and ('kk' in text_lower or 'kartu' in text_lower):
                match = re.search(r'(\d{16})', text)
                if not match and next_text is not None:
                    match = re.search(r'(\d{16})', next_text)
                if match:
                    result['no_kk'] = match.group(1)

            # Head-of-family name. Only the first hit is kept: table rows also
            # contain "KEPALA KELUARGA" (as a relationship value) and must not
            # clobber a name that was already found.
            if ('kepala' in text_lower and 'keluarga' in text_lower
                    and result['nama_kepala_keluarga'] is None):
                name = self._extract_value(text, 'keluarga')
                if not name and next_text is not None:
                    # The name may be on the next line, unless that line is
                    # itself another label.
                    candidate = next_text.strip()
                    if candidate and not self._NEXT_LABEL_RE.search(candidate):
                        name = candidate
                result['nama_kepala_keluarga'] = name or None

            # Address
            if 'alamat' in text_lower and result['alamat'] is None:
                result['alamat'] = self._extract_value(text, 'alamat')

            # RT/RW (neighbourhood units), normalized to "rt/rw"
            rt_rw_match = re.search(
                r'rt\s*/?\s*rw\s*[:\s]*(\d+)\s*/\s*(\d+)', text_lower)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # Postal code
            if 'kode' in text_lower and 'pos' in text_lower:
                match = re.search(r'(\d{5})', text)
                if match:
                    result['kode_pos'] = match.group(1)

            # Member-table header: switch to table mode, nothing else to read.
            if self._is_table_header(text_lower):
                in_table = True
                continue

            # Member rows. Table mode is tracked by the flag alone; a header
            # y-coordinate of 0 (or a missing one) must not disable it.
            if in_table:
                member = self._extract_member(text, ocr_results, i)
                if member:
                    result['anggota_keluarga'].append(member)

        return self._post_process(result)

    def _extract_value(self, text: str, field: str) -> Optional[str]:
        """Return the text that follows the label ``field`` in ``text``.

        The label may carry a trailing letter suffix and is separated from its
        value by a colon or by whitespace. Matching is case-insensitive.

        Args:
            text: One OCR line.
            field: Label to look for; treated as literal text.

        Returns:
            The stripped value, or ``None`` when the label is absent or no
            value follows it.
        """
        label = re.escape(field)
        patterns = (
            rf'{label}[a-z]*\s*:\s*(.+)',
            rf'{label}[a-z]*\s+(.+)',
        )

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            # Drop leftover separators, e.g. from "Label  : value".
            value = re.sub(r'^[:\s]+', '', match.group(1).strip())
            if value:
                return value
        return None

    def _is_table_header(self, text: str) -> bool:
        """Return True when ``text`` looks like the member-table header row.

        A line qualifies when it contains at least two of the header cues
        ("no", "nama lengkap", "nik", "jenis kelamin", "hubungan").
        """
        hits = sum(1 for pattern in self._HEADER_PATTERNS if pattern.search(text))
        return hits >= 2

    def _extract_member(self, text: str, all_results: List[Dict],
                        current_idx: int) -> Optional[Dict]:
        """Build a family-member record from one table row.

        Args:
            text: OCR text of the row.
            all_results: Full OCR list (currently unused).
            current_idx: Index of the row in ``all_results`` (currently unused).

        Returns:
            A member dict, or ``None`` when the row holds no standalone
            16-digit NIK. Only ``nik``, ``jenis_kelamin`` and ``hubungan`` are
            filled in; the remaining keys are always ``None``.
        """
        nik_match = re.search(r'\b(\d{16})\b', text)
        if not nik_match:
            return None

        member = {
            'nik': nik_match.group(1),
            'nama': None,
            'jenis_kelamin': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'hubungan': None,
        }

        lowered = text.lower()
        # Padded so that a lone "l" / "p" token at either end is also found.
        padded = f' {lowered} '

        # Gender: full word, or the single-letter abbreviation as a token.
        if 'laki' in lowered or ' l ' in padded:
            member['jenis_kelamin'] = 'LAKI-LAKI'
        elif 'perempuan' in lowered or ' p ' in padded:
            member['jenis_kelamin'] = 'PEREMPUAN'

        # Relationship within the family: first matching keyword wins.
        for keyword, value in self._HUBUNGAN_KEYWORDS:
            if keyword in lowered:
                member['hubungan'] = value
                break

        return member

    def _post_process(self, result: Dict) -> Dict:
        """Normalize the extracted fields in place and return ``result``.

        The KK number is reduced to its digits and dropped unless exactly 16
        remain; the free-text fields are upper-cased.
        """
        no_kk = result['no_kk']
        if no_kk and not re.match(r'^\d{16}$', no_kk):
            cleaned = re.sub(r'\D', '', no_kk)
            result['no_kk'] = cleaned if len(cleaned) == 16 else None

        for field in ('nama_kepala_keluarga', 'alamat', 'kel_desa', 'kecamatan',
                      'kabupaten_kota', 'provinsi'):
            if result[field]:
                result[field] = result[field].upper()

        return result
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the extractor on a hand-written KK header and
    # print every scalar field followed by the number of members found.
    demo_lines = [
        ('KARTU KELUARGA', 10),
        ('No. 3204012345678901', 30),
        ('Nama Kepala Keluarga : JOHN DOE', 50),
        ('Alamat : JL. MERDEKA NO. 123', 70),
        ('RT/RW : 001/002', 90),
        ('Desa/Kelurahan : SUKAMAJU', 110),
        ('Kecamatan : SUKASARI', 130),
        ('Kabupaten/Kota : BANDUNG', 150),
        ('Provinsi : JAWA BARAT', 170),
        ('Kode Pos : 40154', 190),
    ]
    sample_ocr = [
        {'text': line, 'y_center': y_center} for line, y_center in demo_lines
    ]

    extractor = KKExtractor()
    result = extractor.extract(sample_ocr)

    for key, value in result.items():
        # The member list is summarized separately below.
        if key == 'anggota_keluarga':
            continue
        print(f"{key}: {value}")

    members = result['anggota_keluarga']
    print(f"\nAnggota Keluarga: {len(members)} orang")
|