OCR dengan ZONA
This commit is contained in:
235
kk_extractor.py
Normal file
235
kk_extractor.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
KK (Kartu Keluarga) Field Extractor
|
||||
Ekstraksi data terstruktur dari hasil OCR KK Indonesia
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, Optional, List
|
||||
|
||||
|
||||
class KKExtractor:
    """Extract structured fields from OCR output of a Kartu Keluarga (KK).

    The extraction is heuristic: OCR text fragments are scanned for known
    Indonesian field labels (``Provinsi``, ``Kecamatan``, ...) and for
    16-digit numbers (KK number / NIK).
    """

    # 16-digit identifier (KK number or NIK) delimited by word boundaries.
    _ID_16_RE = re.compile(r'\b(\d{16})\b')
    # "RT/RW : 001/002" style value; applied to lower-cased text.
    _RT_RW_RE = re.compile(r'rt\s*/?\s*rw\s*[:\s]*(\d+)\s*/\s*(\d+)')
    # A line that is itself another labelled field and therefore not a name.
    # "rt"/"rw" must not be embedded in a word, otherwise names such as
    # HARTONO or DARWIN would be rejected.
    _NEXT_LABEL_RE = re.compile(r'alamat|provinsi|(?<![a-z])r[tw](?![a-z])')
    # Column captions of the family-member table (lower case).
    _HEADER_KEYWORDS = ('no', 'nama lengkap', 'nik', 'jenis kelamin', 'hubungan')
    # Relationship keyword -> normalised value; first match in this order wins.
    _RELATIONSHIPS = {
        'kepala': 'KEPALA KELUARGA',
        'istri': 'ISTRI',
        'suami': 'SUAMI',
        'anak': 'ANAK',
        'menantu': 'MENANTU',
        'cucu': 'CUCU',
        'orang tua': 'ORANG TUA',
        'mertua': 'MERTUA',
    }

    def __init__(self):
        pass

    @staticmethod
    def _text_of(entry: Dict) -> str:
        """Return the stripped ``'text'`` of an OCR entry ('' if missing/None)."""
        return str(entry.get('text') or '').strip()

    def extract(self, ocr_results: List[Dict]) -> Dict:
        """Extract KK fields from OCR results.

        Args:
            ocr_results: List of OCR entries; each entry is a dict whose
                ``'text'`` key holds the recognised text.

        Returns:
            Dict with the keys ``no_kk``, ``nama_kepala_keluarga``,
            ``alamat``, ``rt_rw``, ``kel_desa``, ``kecamatan``,
            ``kabupaten_kota``, ``provinsi``, ``kode_pos`` (each ``None``
            when not found) and ``anggota_keluarga`` (list of member dicts).
        """
        texts = [self._text_of(entry) for entry in ocr_results]

        result = {
            'no_kk': None,
            'nama_kepala_keluarga': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'kabupaten_kota': None,
            'provinsi': None,
            'kode_pos': None,
            'anggota_keluarga': [],
        }

        # Fallback KK number: the first 16-digit number anywhere in the text.
        kk_match = self._ID_16_RE.search('\n'.join(texts))
        if kk_match:
            result['no_kk'] = kk_match.group(1)

        # Becomes True once the member-table header has been seen. A plain
        # flag is used on purpose: gating on the header's y position would
        # silently disable member extraction when that position is 0/missing.
        in_table = False

        for i, text in enumerate(texts):
            text_lower = text.lower()
            next_text = texts[i + 1] if i + 1 < len(texts) else None

            # Province
            if 'provinsi' in text_lower and result['provinsi'] is None:
                result['provinsi'] = self._extract_value(text, 'provinsi')

            # Regency / city
            if (('kabupaten' in text_lower or 'kota' in text_lower)
                    and result['kabupaten_kota'] is None):
                val = (self._extract_value(text, 'kabupaten')
                       or self._extract_value(text, 'kota'))
                # Fall back to the whole line when no value follows the label.
                result['kabupaten_kota'] = val if val else text

            # District
            if 'kecamatan' in text_lower and result['kecamatan'] is None:
                result['kecamatan'] = self._extract_value(text, 'kecamatan')

            # Village
            if (('kelurahan' in text_lower or 'desa' in text_lower)
                    and result['kel_desa'] is None):
                result['kel_desa'] = (self._extract_value(text, 'kelurahan')
                                      or self._extract_value(text, 'desa'))

            # Labelled KK number: look in this fragment, then in the next one.
            if 'no' in text_lower and ('kk' in text_lower or 'kartu' in text_lower):
                match = re.search(r'(\d{16})', text)
                if match is None and next_text is not None:
                    match = re.search(r'(\d{16})', next_text)
                if match:
                    result['no_kk'] = match.group(1)

            # Head of family. Only the first hit is kept so that a table row
            # whose relationship column reads "KEPALA KELUARGA" cannot
            # overwrite the name taken from the card header.
            if ('kepala' in text_lower and 'keluarga' in text_lower
                    and result['nama_kepala_keluarga'] is None):
                name = self._extract_value(text, 'keluarga')
                if (not name and next_text is not None
                        and not self._NEXT_LABEL_RE.search(next_text.lower())):
                    # The name may sit on the following line.
                    name = next_text
                if name:
                    result['nama_kepala_keluarga'] = name

            # Address
            if 'alamat' in text_lower and result['alamat'] is None:
                result['alamat'] = self._extract_value(text, 'alamat')

            # RT/RW
            rt_rw_match = self._RT_RW_RE.search(text_lower)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # Postal code
            if 'kode' in text_lower and 'pos' in text_lower:
                match = re.search(r'(\d{5})', text)
                if match:
                    result['kode_pos'] = match.group(1)

            # Header of the family-member table
            if self._is_table_header(text_lower):
                in_table = True
                continue

            # Family-member rows
            if in_table:
                member = self._extract_member(text, ocr_results, i)
                if member:
                    result['anggota_keluarga'].append(member)

        return self._post_process(result)

    def _extract_value(self, text: str, field: str) -> Optional[str]:
        """Return the value following the label ``field`` in ``text``.

        Tries ``<field>... : value`` first, then ``<field>... value``
        (case-insensitive). Returns ``None`` when nothing follows the label.
        """
        label = re.escape(field)  # the label is literal text, not a pattern
        patterns = [
            rf'{label}[a-z]*\s*:\s*(.+)',
            rf'{label}[a-z]*\s+(.+)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = re.sub(r'^[:\s]+', '', match.group(1).strip())
                if value:
                    return value
        return None

    def _is_table_header(self, text: str) -> bool:
        """Return True if ``text`` looks like the member-table header.

        ``text`` is expected to be lower-cased. At least two header captions
        must appear; captions are matched as whole words so that names such
        as "SUPARNO" ("no") or "NIKEN" ("nik") do not count.
        """
        hits = sum(
            1 for kw in self._HEADER_KEYWORDS
            if re.search(rf'(?<![a-z]){re.escape(kw)}(?![a-z])', text)
        )
        return hits >= 2

    def _extract_member(self, text: str, all_results: List[Dict],
                        current_idx: int) -> Optional[Dict]:
        """Extract one family member from a table row.

        Args:
            text: Text of the row.
            all_results: All OCR entries (currently unused).
            current_idx: Index of the row in ``all_results`` (currently unused).

        Returns:
            Member dict, or ``None`` when the row contains no 16-digit NIK.
            ``nama``, ``tempat_lahir`` and ``tanggal_lahir`` are never
            filled in here and stay ``None``.
        """
        nik_match = self._ID_16_RE.search(text)
        if not nik_match:
            return None

        member = {
            'nik': nik_match.group(1),
            'nama': None,
            'jenis_kelamin': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'hubungan': None,
        }

        lowered = text.lower()
        padded = f' {lowered} '  # lets a lone "l"/"p" token match at the edges

        # Gender
        if 'laki' in lowered or ' l ' in padded:
            member['jenis_kelamin'] = 'LAKI-LAKI'
        elif 'perempuan' in lowered or ' p ' in padded:
            member['jenis_kelamin'] = 'PEREMPUAN'

        # Family relationship
        for keyword, value in self._RELATIONSHIPS.items():
            if keyword in lowered:
                member['hubungan'] = value
                break

        return member

    def _post_process(self, result: Dict) -> Dict:
        """Validate the KK number and upper-case the textual fields in place."""
        # KK number must be exactly 16 digits once non-digits are removed.
        if result['no_kk'] and not re.match(r'^\d{16}$', result['no_kk']):
            cleaned = re.sub(r'\D', '', result['no_kk'])
            result['no_kk'] = cleaned if len(cleaned) == 16 else None

        # Upper-case text fields
        for field in ['nama_kepala_keluarga', 'alamat', 'kel_desa', 'kecamatan',
                      'kabupaten_kota', 'provinsi']:
            if result[field]:
                result[field] = result[field].upper()

        return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: run the extractor over a hand-written KK header whose
    # lines are spaced 20 units apart vertically, starting at y = 10.
    demo_lines = [
        'KARTU KELUARGA',
        'No. 3204012345678901',
        'Nama Kepala Keluarga : JOHN DOE',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Desa/Kelurahan : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Kabupaten/Kota : BANDUNG',
        'Provinsi : JAWA BARAT',
        'Kode Pos : 40154',
    ]
    sample_ocr = [
        {'text': line, 'y_center': 10 + 20 * idx}
        for idx, line in enumerate(demo_lines)
    ]

    extracted = KKExtractor().extract(sample_ocr)

    # Print every scalar field; the member list is summarised separately.
    for name in extracted:
        if name == 'anggota_keluarga':
            continue
        print(f"{name}: {extracted[name]}")

    member_count = len(extracted['anggota_keluarga'])
    print(f"\nAnggota Keluarga: {member_count} orang")
|
||||
Reference in New Issue
Block a user