OCR dengan ZONA

2025-12-28 01:20:37 +08:00
commit 4fe381b3f0
12 changed files with 2356 additions and 0 deletions
--- a/pycache/kk_extractor.cpython-313.pyc
+++ b/pycache/kk_extractor.cpython-313.pyc
--- a/pycache/ktp_extractor.cpython-313.pyc
+++ b/pycache/ktp_extractor.cpython-313.pyc
--- a/pycache/ocr_engine.cpython-313.pyc
+++ b/pycache/ocr_engine.cpython-313.pyc
--- a/app.py
+++ b/app.py
@@ -0,0 +1,253 @@
 """
 Flask Web Server untuk OCR KTP/KK
 """
 import os
 from flask import Flask, render_template, request, jsonify
 from werkzeug.utils import secure_filename
 from ocr_engine import get_ocr_engine
 from ktp_extractor import KTPExtractor
 from kk_extractor import KKExtractor
 app = Flask(__name__)
 # Configuration: where uploads are staged, which extensions are accepted,
 # and the maximum request body size enforced by Flask.
 UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), 'uploads')
 ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'bmp', 'webp'}
 MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB max
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH
 # Create the upload folder if it does not exist yet
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 # Initialise the extractors once at import time; they are shared by all requests
 ktp_extractor = KTPExtractor()
 kk_extractor = KKExtractor()
 def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared case-insensitively.
    A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the main page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle an image upload, run OCR on it and return the extracted fields.

    Expects a multipart form with a ``file`` part and an optional ``doc_type``
    field (defaults to ``'ktp'``; any other value selects the KK extractor).

    Returns:
        A JSON response. On success it carries ``success``, ``doc_type``,
        ``data`` (extracted fields), ``raw_text`` and ``ocr_count``. Invalid
        uploads or unreadable images yield HTTP 400, unexpected errors HTTP 500.
    """
    # Local import: only this handler needs it, and it keeps the fix
    # self-contained without touching the module import block.
    import uuid

    try:
        # Validate the upload
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'Tidak ada file yang diupload'}), 400
        file = request.files['file']
        doc_type = request.form.get('doc_type', 'ktp')
        # ``not`` also covers a missing (None) filename, which would otherwise
        # crash allowed_file() and surface as a 500.
        if not file.filename:
            return jsonify({'success': False, 'error': 'Nama file kosong'}), 400
        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Format file tidak didukung. Gunakan PNG, JPG, JPEG, BMP, atau WEBP'}), 400

        # Store the upload under a random, collision-free name. Using the
        # client-supplied name let two concurrent uploads with the same name
        # overwrite each other (and one request's cleanup delete the other's
        # file); secure_filename() could also collapse to an empty string,
        # making the target path the upload directory itself. The extension
        # is safe to reuse because allowed_file() checked it against the
        # whitelist.
        extension = file.filename.rsplit('.', 1)[1].lower()
        filename = f"{uuid.uuid4().hex}.{extension}"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        try:
            # Run OCR
            ocr_engine = get_ocr_engine()
            ocr_results = ocr_engine.extract_text(filepath)
            if not ocr_results:
                return jsonify({
                    'success': False,
                    'error': 'Tidak dapat membaca teks dari gambar. Pastikan gambar jelas dan tidak blur.'
                }), 400
            # Extract fields according to the document type
            if doc_type == 'ktp':
                extracted = ktp_extractor.extract(ocr_results)
            else:
                extracted = kk_extractor.extract(ocr_results)
            # Raw text, returned to the client for debugging
            raw_text = '\n'.join(r['text'] for r in ocr_results)
            # DEBUG: print raw OCR results.
            # NOTE(review): this writes personal data from the ID document to
            # the server console/logs, which undermines the file deletion
            # below — consider removing or gating it before production use.
            print("\n" + "="*50)
            print("DEBUG: Raw OCR Results")
            print("="*50)
            for i, r in enumerate(ocr_results):
                print(f"[{i}] {r['text']}")
            print("="*50 + "\n")
            return jsonify({
                'success': True,
                'doc_type': doc_type,
                'data': extracted,
                'raw_text': raw_text,
                'ocr_count': len(ocr_results)
            })
        finally:
            # Delete the file after processing (it contains personal data)
            if os.path.exists(filepath):
                os.remove(filepath)
    except Exception as e:
        # Top-level boundary: report the failure as JSON instead of an HTML 500 page
        return jsonify({'success': False, 'error': str(e)}), 500
 # ============================================
 # Region Data API (using wilayah.id)
 # ============================================
 import requests
 from functools import lru_cache
 WILAYAH_API_BASE = "https://wilayah.id/api"
@lru_cache(maxsize=100)
def _fetch_region_data_cached(endpoint):
    """Fetch and decode one region endpoint; raise on any failure.

    Raising (instead of returning None) matters: lru_cache does not store a
    result when the wrapped call raises, so only successful responses are
    memoised.
    """
    response = requests.get(f"{WILAYAH_API_BASE}/{endpoint}", timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code} for {endpoint}")
    return response.json()


def fetch_region_data(endpoint):
    """Fetch region data from the wilayah API, caching successful responses.

    Args:
        endpoint: Path below WILAYAH_API_BASE, e.g. ``"provinces.json"``.

    Returns:
        The decoded JSON payload, or None when the request fails, times out,
        returns a non-200 status or carries invalid JSON. Failures are logged
        to stdout and are NOT cached, so a transient outage no longer poisons
        the cache for that endpoint.
    """
    try:
        return _fetch_region_data_cached(endpoint)
    except Exception as e:
        print(f"Error fetching region data: {e}")
        return None
 def normalize_name(name):
    """Canonicalise a region name for comparison.

    The name is upper-cased, stripped of surrounding whitespace, and every
    dot and space is removed. Falsy input (None, empty string) yields "".
    """
    if not name:
        return ""
    cleaned = name.upper().strip()
    for unwanted in (".", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
 def find_best_match(search_name, items, key='name'):
    """Find the item whose name best matches *search_name*.

    Matching is done on normalised names (see normalize_name) in two passes:
    an exact match first, then a substring match in either direction.

    Args:
        search_name: Name to look for (typically OCR output).
        items: Iterable of dicts to search.
        key: Dict key that holds each item's name.

    Returns:
        The first matching item, or None when nothing matches or the search
        name is empty after normalisation.
    """
    if not search_name or not items:
        return None
    search_norm = normalize_name(search_name)
    # A search that normalises to "" (e.g. "." or " ") would be a substring of
    # every name and wrongly match the first item.
    if not search_norm:
        return None
    # Normalise each candidate once instead of once per pass
    candidates = [(normalize_name(item.get(key, '')), item) for item in items]
    # Try exact match first
    for item_norm, item in candidates:
        if item_norm == search_norm:
            return item
    # Try contains match; skip nameless items because "" is contained in
    # every string and would match any search.
    for item_norm, item in candidates:
        if item_norm and (search_norm in item_norm or item_norm in search_norm):
            return item
    return None
@app.route('/api/provinces')
def get_provinces():
    """Return all provinces as JSON; an empty list with HTTP 500 if the lookup fails."""
    payload = fetch_region_data("provinces.json")
    if not payload:
        return jsonify({'data': []}), 500
    return jsonify(payload)
@app.route('/api/regencies/<province_code>')
def get_regencies(province_code):
    """Return the cities/regencies of a province as JSON; empty list with HTTP 500 on failure."""
    endpoint = f"regencies/{province_code}.json"
    payload = fetch_region_data(endpoint)
    if not payload:
        return jsonify({'data': []}), 500
    return jsonify(payload)
@app.route('/api/districts/<regency_code>')
def get_districts(regency_code):
    """Return the districts of a regency as JSON; empty list with HTTP 500 on failure."""
    endpoint = f"districts/{regency_code}.json"
    payload = fetch_region_data(endpoint)
    if not payload:
        return jsonify({'data': []}), 500
    return jsonify(payload)
@app.route('/api/villages/<district_code>')
def get_villages(district_code):
    """Return the villages of a district as JSON; empty list with HTTP 500 on failure."""
    endpoint = f"villages/{district_code}.json"
    payload = fetch_region_data(endpoint)
    if not payload:
        return jsonify({'data': []}), 500
    return jsonify(payload)
@app.route('/api/validate-region', methods=['POST'])
def validate_region():
    """Validate OCR region names against the region database, top-down.

    Expects a JSON object that may contain ``provinsi``, ``kabupaten_kota``,
    ``kecamatan`` and ``kel_desa``. Each level is looked up inside the match
    found for the level above it; validation stops at the first level that
    cannot be fetched or matched, leaving the deeper levels marked invalid.

    Returns:
        JSON ``{'success': True, 'validation': {...}}`` where every level maps
        to ``{'valid', 'code', 'suggestion'}``. A missing or non-object JSON
        body yields HTTP 400; unexpected errors yield HTTP 500.
    """
    try:
        # silent=True returns None instead of raising on a wrong content type
        # or malformed JSON, so client mistakes become a 400 rather than a 500.
        ocr_data = request.get_json(silent=True)
        if not isinstance(ocr_data, dict):
            return jsonify({'success': False, 'error': 'Body request harus berupa objek JSON'}), 400

        # (field in the request/response, API collection) from widest to narrowest
        levels = (
            ('provinsi', 'provinces'),
            ('kabupaten_kota', 'regencies'),
            ('kecamatan', 'districts'),
            ('kel_desa', 'villages'),
        )
        result = {
            field: {'valid': False, 'code': None, 'suggestion': None}
            for field, _ in levels
        }

        parent_code = None
        for field, collection in levels:
            # The top level has no parent; deeper levels are scoped by the
            # code of the match one level up.
            if parent_code is None:
                endpoint = f"{collection}.json"
            else:
                endpoint = f"{collection}/{parent_code}.json"
            payload = fetch_region_data(endpoint)
            if not payload or 'data' not in payload:
                break
            match = find_best_match(ocr_data.get(field), payload['data'])
            if not match:
                break
            result[field] = {'valid': True, 'code': match['code'], 'suggestion': match['name']}
            parent_code = match['code']

        return jsonify({'success': True, 'validation': result})
    except Exception as e:
        # Top-level boundary: report the failure as JSON
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/health')
def health():
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    status = {'status': 'ok'}
    return jsonify(status)
 if __name__ == '__main__':
    # Development entry point.
    #
    # Debug mode enables the Werkzeug interactive debugger, which allows
    # arbitrary code execution from the browser. It must never be reachable
    # from other machines, so it is opt-in (FLASK_DEBUG=1) and, when enabled,
    # the server binds to loopback only. Without debug the server keeps
    # listening on all interfaces as before.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    bind_host = '127.0.0.1' if debug_enabled else '0.0.0.0'
    print("="*50)
    print("OCR KTP/KK Application")
    print("="*50)
    print("Membuka: http://localhost:5000")
    print("Tekan Ctrl+C untuk berhenti")
    print("="*50)
    app.run(host=bind_host, port=5000, debug=debug_enabled)
--- a/kk.png
+++ b/kk.png
--- a/kk_extractor.py
+++ b/kk_extractor.py
@@ -0,0 +1,235 @@
 """
 KK (Kartu Keluarga) Field Extractor
 Ekstraksi data terstruktur dari hasil OCR KK Indonesia
 """
 import re
 from typing import Dict, Optional, List
 class KKExtractor:
    """Extract structured fields from OCR output of a Kartu Keluarga (family card)."""

    def __init__(self):
        pass

    def extract(self, ocr_results: List[Dict]) -> Dict:
        """Extract KK fields from OCR results.

        Args:
            ocr_results: List of dicts, each with at least a ``'text'`` key,
                in reading order (header lines before the member table).

        Returns:
            Dict with the keys ``no_kk``, ``nama_kepala_keluarga``, ``alamat``,
            ``rt_rw``, ``kel_desa``, ``kecamatan``, ``kabupaten_kota``,
            ``provinsi``, ``kode_pos`` (each a string or None) and
            ``anggota_keluarga`` (list of member dicts).
        """
        all_text = '\n'.join(r['text'] for r in ocr_results)
        result = {
            'no_kk': None,
            'nama_kepala_keluarga': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'kabupaten_kota': None,
            'provinsi': None,
            'kode_pos': None,
            'anggota_keluarga': [],
        }
        # KK number: first standalone 16-digit run anywhere in the text
        kk_match = re.search(r'\b(\d{16})\b', all_text)
        if kk_match:
            result['no_kk'] = kk_match.group(1)

        # Becomes True once the member-table header has been seen. This is a
        # plain flag on purpose: the previous check also required a truthy
        # header y-position, which silently disabled member extraction when
        # the header's y_center was 0 or absent.
        in_table = False
        for i, ocr in enumerate(ocr_results):
            text = ocr['text'].strip()
            text_lower = text.lower()

            # Province
            if 'provinsi' in text_lower and result['provinsi'] is None:
                result['provinsi'] = self._extract_value(text, 'provinsi')

            # Regency / city; fall back to the whole line when no value follows the label
            if ('kabupaten' in text_lower or 'kota' in text_lower) and result['kabupaten_kota'] is None:
                val = self._extract_value(text, 'kabupaten') or self._extract_value(text, 'kota')
                result['kabupaten_kota'] = val if val else text

            # District
            if 'kecamatan' in text_lower and result['kecamatan'] is None:
                result['kecamatan'] = self._extract_value(text, 'kecamatan')

            # Village
            if ('kelurahan' in text_lower or 'desa' in text_lower) and result['kel_desa'] is None:
                result['kel_desa'] = self._extract_value(text, 'kelurahan') or self._extract_value(text, 'desa')

            # Labelled KK number: look for 16 digits on this line, then on the next one
            if 'no' in text_lower and ('kk' in text_lower or 'kartu' in text_lower):
                match = re.search(r'(\d{16})', text)
                if not match and i + 1 < len(ocr_results):
                    match = re.search(r'(\d{16})', ocr_results[i + 1]['text'])
                if match:
                    result['no_kk'] = match.group(1)

            # Head of family. Only the header may set it: table rows carry
            # "KEPALA KELUARGA" in the relationship column and used to
            # overwrite (usually with None) the name taken from the header.
            if (not in_table and result['nama_kepala_keluarga'] is None
                    and 'kepala' in text_lower and 'keluarga' in text_lower):
                result['nama_kepala_keluarga'] = self._extract_value(text, 'keluarga')
                if not result['nama_kepala_keluarga'] and i + 1 < len(ocr_results):
                    # The name may be on the following line
                    next_text = ocr_results[i + 1]['text'].strip()
                    if not any(kw in next_text.lower() for kw in ['alamat', 'rt', 'rw', 'provinsi']):
                        result['nama_kepala_keluarga'] = next_text

            # Address
            if 'alamat' in text_lower and result['alamat'] is None:
                result['alamat'] = self._extract_value(text, 'alamat')

            # RT/RW
            rt_rw_match = re.search(r'rt\s*/?\s*rw\s*[:\s]*(\d+)\s*/\s*(\d+)', text_lower)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # Postal code
            if 'kode' in text_lower and 'pos' in text_lower:
                match = re.search(r'(\d{5})', text)
                if match:
                    result['kode_pos'] = match.group(1)

            # Member-table header: everything after it is treated as table rows
            if self._is_table_header(text_lower):
                in_table = True
                continue

            # Member rows
            if in_table:
                member = self._extract_member(text, ocr_results, i)
                if member:
                    result['anggota_keluarga'].append(member)

        return self._post_process(result)

    def _extract_value(self, text: str, field: str) -> Optional[str]:
        """Return the value that follows a field label, or None.

        *field* is inserted into a regular expression as-is; the label may be
        followed by extra letters, then either a colon or whitespace.
        """
        patterns = [
            rf'{field}[a-z]*\s*:\s*(.+)',
            rf'{field}[a-z]*\s+(.+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Drop any leftover colons/whitespace in front of the value
                value = re.sub(r'^[:\s]+', '', match.group(1).strip())
                if value:
                    return value
        return None

    def _is_table_header(self, text: str) -> bool:
        """Return True when lower-cased *text* contains at least two table-header keywords."""
        header_keywords = ['no', 'nama lengkap', 'nik', 'jenis kelamin', 'hubungan']
        count = sum(1 for kw in header_keywords if kw in text)
        return count >= 2

    def _extract_member(self, text: str, all_results: List[Dict], current_idx: int) -> Optional[Dict]:
        """Build a family-member record from one table row.

        A row only counts as a member when it contains a standalone 16-digit
        NIK. Gender and relationship are detected by keyword; the remaining
        fields are left as None. ``all_results`` and ``current_idx`` are
        accepted for interface compatibility but are not used.

        Returns:
            The member dict, or None when the row carries no NIK.
        """
        nik_match = re.search(r'\b(\d{16})\b', text)
        if not nik_match:
            return None
        member = {
            'nik': nik_match.group(1),
            'nama': None,
            'jenis_kelamin': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'hubungan': None,
        }
        text_lower = text.lower()
        # Pad with spaces so a lone "l"/"p" token at either end is also found
        padded = f' {text_lower} '
        if 'laki' in text_lower or ' l ' in padded:
            member['jenis_kelamin'] = 'LAKI-LAKI'
        elif 'perempuan' in text_lower or ' p ' in padded:
            member['jenis_kelamin'] = 'PEREMPUAN'
        # Relationship: the first keyword (in this order) found in the row wins
        hubungan_keywords = {
            'kepala': 'KEPALA KELUARGA',
            'istri': 'ISTRI',
            'suami': 'SUAMI',
            'anak': 'ANAK',
            'menantu': 'MENANTU',
            'cucu': 'CUCU',
            'orang tua': 'ORANG TUA',
            'mertua': 'MERTUA',
        }
        for keyword, value in hubungan_keywords.items():
            if keyword in text_lower:
                member['hubungan'] = value
                break
        return member

    def _post_process(self, result: Dict) -> Dict:
        """Validate the KK number and upper-case the textual fields in place."""
        # KK number must be exactly 16 digits; try stripping non-digits first
        if result['no_kk'] and not re.match(r'^\d{16}$', result['no_kk']):
            cleaned = re.sub(r'\D', '', result['no_kk'])
            result['no_kk'] = cleaned if len(cleaned) == 16 else None
        # Upper-case text fields
        for field in ['nama_kepala_keluarga', 'alamat', 'kel_desa', 'kecamatan',
                      'kabupaten_kota', 'provinsi']:
            if result[field]:
                result[field] = result[field].upper()
        return result
 if __name__ == "__main__":
    # Smoke test: run the extractor over a hand-written KK sample whose lines
    # are spaced 20 units apart vertically, starting at y=10.
    sample_lines = [
        'KARTU KELUARGA',
        'No. 3204012345678901',
        'Nama Kepala Keluarga : JOHN DOE',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Desa/Kelurahan : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Kabupaten/Kota : BANDUNG',
        'Provinsi : JAWA BARAT',
        'Kode Pos : 40154',
    ]
    sample_ocr = [
        {'text': line, 'y_center': 10 + 20 * position}
        for position, line in enumerate(sample_lines)
    ]
    extractor = KKExtractor()
    result = extractor.extract(sample_ocr)
    # Print every scalar field; the member list is summarised by its size
    for key, value in result.items():
        if key == 'anggota_keluarga':
            continue
        print(f"{key}: {value}")
    member_count = len(result['anggota_keluarga'])
    print(f"\nAnggota Keluarga: {member_count} orang")
--- a/ktp.jpeg
+++ b/ktp.jpeg
--- a/ktp_extractor.py
+++ b/ktp_extractor.py
@@ -0,0 +1,602 @@
 """
 KTP Field Extractor
 Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
 Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
 """
 import re
 from typing import Dict, Optional, List
 class KTPExtractor:
    """Ekstrak field dari hasil OCR KTP"""
    # Pattern colon yang berbeda-beda (standard, full-width, dll)
    COLON_PATTERN = r'[:\：]'
    # Keywords untuk jenis kelamin
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']
    # Agama yang valid
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']
    # Pekerjaan umum
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta', 
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga', 
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']
    # KTP Zone Template (normalized coordinates: x_min, y_min, x_max, y_max)
    # Based on standard KTP layout
    ZONES = {
        'header_provinsi':  (0.15, 0.00, 0.85, 0.07),  # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik':              (0.02, 0.10, 0.70, 0.22),  # NIK area
        'nama':             (0.02, 0.18, 0.70, 0.28),  # Nama area
        'ttl':              (0.02, 0.25, 0.70, 0.36),  # Tempat/Tgl Lahir
        'jenis_kelamin':    (0.02, 0.33, 0.45, 0.42),  # Jenis Kelamin (left)
        'gol_darah':        (0.40, 0.33, 0.70, 0.42),  # Gol Darah (right of jenis)
        'alamat':           (0.02, 0.38, 0.70, 0.50),  # Alamat
        'rt_rw':            (0.02, 0.46, 0.70, 0.54),  # RT/RW
        'kel_desa':         (0.02, 0.51, 0.70, 0.60),  # Kel/Desa
        'kecamatan':        (0.02, 0.57, 0.70, 0.66),  # Kecamatan
        'agama':            (0.02, 0.63, 0.70, 0.72),  # Agama
        'status':           (0.02, 0.69, 0.70, 0.78),  # Status Perkawinan
        'pekerjaan':        (0.02, 0.75, 0.70, 0.84),  # Pekerjaan
        'wni':              (0.02, 0.81, 0.70, 0.90),  # Kewarganegaraan
        'berlaku':          (0.02, 0.87, 0.70, 0.96),  # Berlaku Hingga
        'foto':             (0.68, 0.10, 0.98, 0.55),  # Foto (right side)
        'penerbitan':       (0.65, 0.58, 0.98, 0.98),  # Tempat & Tanggal penerbitan
    }
    def __init__(self):
        self.image_width = 0
        self.image_height = 0
    def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
        """Determine which zone a text belongs to based on normalized coordinates"""
        if img_width == 0 or img_height == 0:
            return None
        # Normalize coordinates
        x_norm = x_center / img_width
        y_norm = y_center / img_height
        for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
            if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
                return zone_name
        return None
    def _extract_value_from_text(self, text: str) -> str:
        """Extract value part from label:value text"""
        # Split by colon (standard or full-width)
        parts = re.split(r'[：:]', text, 1)
        if len(parts) > 1:
            return parts[1].strip()
        return text.strip()
    def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
        """Detect image dimensions from bounding boxes"""
        max_x, max_y = 0, 0
        for r in ocr_results:
            bbox = r.get('bbox', [])
            if bbox and len(bbox) >= 4:
                for point in bbox:
                    if len(point) >= 2:
                        max_x = max(max_x, point[0])
                        max_y = max(max_y, point[1])
        # Add some margin
        return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)
    def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
        """Extract fields based on zone assignments"""
        # PROVINSI from header
        if 'header_provinsi' in zone_texts:
            for text in zone_texts['header_provinsi']:
                if 'provinsi' in text.lower():
                    val = re.sub(r'(?i)provinsi\s*', '', text).strip()
                    if val:
                        result['provinsi'] = val.upper()
                    break
        # KABUPATEN/KOTA from header
        if 'header_kabupaten' in zone_texts:
            for text in zone_texts['header_kabupaten']:
                text_lower = text.lower()
                if 'kabupaten' in text_lower or 'kota' in text_lower:
                    val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text).strip()
                    if val:
                        result['kabupaten_kota'] = val.upper()
                    else:
                        result['kabupaten_kota'] = text.upper()
                    break
        # NAMA from nama zone (skip label line)
        if 'nama' in zone_texts:
            for text in zone_texts['nama']:
                text_lower = text.lower()
                if 'nama' not in text_lower and len(text) > 2:
                    result['nama'] = text.upper()
                    break
                elif 'nama' in text_lower:
                    val = self._extract_value_from_text(text)
                    if val and 'nama' not in val.lower():
                        result['nama'] = val.upper()
        # TTL from ttl zone
        if 'ttl' in zone_texts:
            for text in zone_texts['ttl']:
                if 'tempat' in text.lower() or 'lahir' in text.lower():
                    val = self._extract_value_from_text(text)
                    if val:
                        self._parse_ttl(val, result)
                        break
        # JENIS KELAMIN
        if 'jenis_kelamin' in zone_texts:
            for text in zone_texts['jenis_kelamin']:
                text_lower = text.lower()
                if 'laki' in text_lower:
                    result['jenis_kelamin'] = 'LAKI-LAKI'
                    break
                elif 'perempuan' in text_lower:
                    result['jenis_kelamin'] = 'PEREMPUAN'
                    break
        # GOL DARAH
        if 'gol_darah' in zone_texts:
            for text in zone_texts['gol_darah']:
                gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
                if gol_match:
                    result['gol_darah'] = gol_match.group(1).upper()
                    break
        # ALAMAT
        if 'alamat' in zone_texts:
            for text in zone_texts['alamat']:
                if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1:
                    val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
                    if val and 'alamat' not in val.lower():
                        result['alamat'] = val.upper()
                        break
        # PENERBITAN area (tempat & tanggal dalam satu zona)
        if 'penerbitan' in zone_texts:
            for text in zone_texts['penerbitan']:
                # Look for date
                date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
                if date_match and result['tanggal_penerbitan'] is None:
                    result['tanggal_penerbitan'] = date_match.group(1)
    def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
        """
        Ekstrak field KTP dari hasil OCR dengan template-based zone detection
        Args:
            ocr_results: List hasil dari OCREngine.extract_text()
        Returns:
            Dict dengan field KTP
        """
        result = {
            'nik': None,
            'nama': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'jenis_kelamin': None,
            'gol_darah': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'agama': None,
            'status_perkawinan': None,
            'pekerjaan': None,
            'kewarganegaraan': None,
            'berlaku_hingga': None,
            'provinsi': None,
            'kabupaten_kota': None,
            'tanggal_penerbitan': None,
        }
        # Detect image dimensions from bounding boxes
        img_width, img_height = self._detect_image_size(ocr_results)
        # Assign zones to each OCR result
        zone_texts = {}  # zone_name -> list of texts
        for r in ocr_results:
            x_center = r.get('x_center', 0)
            y_center = r.get('y_center', 0)
            zone = self._get_zone(x_center, y_center, img_width, img_height)
            if zone:
                if zone not in zone_texts:
                    zone_texts[zone] = []
                zone_texts[zone].append(r['text'])
        # Debug: print zone assignments
        print("\n[DEBUG KTPExtractor] Zone assignments:")
        for zone, texts in zone_texts.items():
            print(f"  {zone}: {texts}")
        # Extract fields using zone-based approach
        self._extract_by_zones(zone_texts, result)
        # Gabungkan semua teks untuk fallback pattern matching
        texts = [r['text'].strip() for r in ocr_results]
        all_text = '\n'.join(texts)
        # Ekstrak NIK (16 digit) - bisa ada di mana saja
        nik_match = re.search(r'\b(\d{16})\b', all_text)
        if nik_match:
            result['nik'] = nik_match.group(1)
            print(f"  -> NIK found: {result['nik']}")
        # Fallback: Parse line by line for fields not found by zone
        for i, text in enumerate(texts):
            text_lower = text.lower()
            # Normalize colons
            text_normalized = re.sub(self.COLON_PATTERN, ':', text)
            text_norm_lower = text_normalized.lower()
            # ===== PROVINSI =====
            if 'provinsi' in text_lower and result['provinsi'] is None:
                val = self._extract_after_label(text_normalized, 'provinsi')
                if val:
                    result['provinsi'] = val.upper()
                elif i + 1 < len(texts) and 'provinsi' not in texts[i+1].lower():
                    # Mungkin value di line berikutnya
                    result['provinsi'] = texts[i+1].strip().upper()
            # ===== KABUPATEN/KOTA =====
            if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None:
                if 'provinsi' not in text_lower:  # Bukan bagian dari provinsi
                    val = self._extract_after_label(text_normalized, 'kabupaten|kota')
                    if val:
                        result['kabupaten_kota'] = val.upper()
                    else:
                        result['kabupaten_kota'] = text.strip().upper()
            # ===== NAMA =====
            if 'nama' in text_lower and result['nama'] is None:
                val = self._extract_after_label(text_normalized, 'nama')
                if val and len(val) > 2:
                    result['nama'] = val.upper()
                elif i + 1 < len(texts):
                    # Nama di line berikutnya
                    next_text = texts[i+1].strip()
                    if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['tempat', 'lahir', 'jenis']):
                        result['nama'] = next_text.upper()
            # ===== TEMPAT/TANGGAL LAHIR =====
            # Match "Tempat/Tgl Lahir" or "Tempat Lahir" or similar labels
            if 'tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower):
                if result['tempat_lahir'] is None or result['tanggal_lahir'] is None:
                    # Extract value after label using full-width or standard colon
                    ttl = self._extract_after_label(text_normalized, r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
                    if ttl:
                        self._parse_ttl(ttl, result)
                    elif '：' in text or ':' in text:
                        # Value is after colon but _extract_after_label didn't catch it
                        parts = re.split(r'[：:]', text, 1)
                        if len(parts) > 1 and parts[1].strip():
                            self._parse_ttl(parts[1].strip(), result)
                    elif i + 1 < len(texts):
                        # TTL di line berikutnya
                        next_text = texts[i+1].strip()
                        if not any(kw in next_text.lower() for kw in ['jenis', 'kelamin', 'alamat', 'gol']):
                            self._parse_ttl(next_text, result)
            # ===== JENIS KELAMIN =====
            if any(kw in text_lower for kw in self.MALE_KEYWORDS):
                if result['jenis_kelamin'] is None:
                    result['jenis_kelamin'] = 'LAKI-LAKI'
            elif any(kw in text_lower for kw in self.FEMALE_KEYWORDS):
                if result['jenis_kelamin'] is None:
                    result['jenis_kelamin'] = 'PEREMPUAN'
            # ===== GOLONGAN DARAH =====
            if 'darah' in text_lower or 'gol.' in text_lower:
                # Try to find blood type on same line
                gol_match = re.search(r'(?:gol|darah)[.\s:：]*([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
                if gol_match and result['gol_darah'] is None:
                    result['gol_darah'] = gol_match.group(1).upper()
                elif result['gol_darah'] is None and i + 1 < len(texts):
                    # Blood type might be on next line (real KTP pattern)
                    next_text = texts[i+1].strip()
                    if re.match(r'^[ABO]{1,2}[+\-]?$', next_text, re.IGNORECASE):
                        result['gol_darah'] = next_text.upper()
            # Standalone blood type (e.g., just "O" or "A+" on its own line)
            if result['gol_darah'] is None:
                if re.match(r'^[ABO]{1,2}[+\-]?$', text.strip(), re.IGNORECASE) and len(text.strip()) <= 3:
                    result['gol_darah'] = text.strip().upper()
            # ===== ALAMAT =====
            if 'alamat' in text_lower and result['alamat'] is None:
                val = self._extract_after_label(text_normalized, 'alamat')
                if val:
                    result['alamat'] = val.upper()
                elif i + 1 < len(texts):
                    result['alamat'] = texts[i+1].strip().upper()
            # ===== RT/RW =====
            rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
            # ===== KELURAHAN/DESA =====
            if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
                if result['kel_desa'] is None:
                    val = self._extract_after_label(text_normalized, 'kel|desa')
                    if val:
                        result['kel_desa'] = val.upper()
                    elif i + 1 < len(texts):
                        result['kel_desa'] = texts[i+1].strip().upper()
            # ===== KECAMATAN =====
            if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
                if result['kecamatan'] is None:
                    val = self._extract_after_label(text_normalized, 'kecamatan|kec')
                    if val:
                        result['kecamatan'] = val.upper()
                    elif i + 1 < len(texts):
                        # Value on next line (real KTP pattern)
                        next_text = texts[i+1].strip()
                        if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']):
                            result['kecamatan'] = next_text.upper()
            # ===== AGAMA =====
            if 'agama' in text_lower:
                val = self._extract_after_label(text_normalized, 'agama')
                if val and result['agama'] is None:
                    result['agama'] = val.upper()
                elif result['agama'] is None and i + 1 < len(texts):
                    # Value on next line (real KTP pattern)
                    next_text = texts[i+1].strip().upper()
                    if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']:
                        result['agama'] = next_text
            else:
                # Check if line contains only agama name
                for agama in self.AGAMA_LIST:
                    if agama in text_lower and len(text) < 20:
                        if result['agama'] is None:
                            result['agama'] = text.strip().upper()
                            break
            # ===== STATUS PERKAWINAN =====
            if 'kawin' in text_lower:
                if result['status_perkawinan'] is None:
                    val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
                    if val:
                        result['status_perkawinan'] = val.upper()
                    elif 'belum' in text_lower:
                        result['status_perkawinan'] = 'BELUM KAWIN'
                    elif 'kawin' in text_lower and 'cerai' not in text_lower:
                        result['status_perkawinan'] = 'KAWIN'
                    elif 'cerai hidup' in text_lower:
                        result['status_perkawinan'] = 'CERAI HIDUP'
                    elif 'cerai mati' in text_lower:
                        result['status_perkawinan'] = 'CERAI MATI'
            # ===== PEKERJAAN =====
            if 'pekerjaan' in text_lower:
                val = self._extract_after_label(text_normalized, 'pekerjaan')
                if val and result['pekerjaan'] is None:
                    result['pekerjaan'] = val.upper()
                elif result['pekerjaan'] is None and i + 1 < len(texts):
                    # Value on next line (real KTP pattern)
                    next_text = texts[i+1].strip()
                    if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
                        result['pekerjaan'] = next_text.upper()
            else:
                # Check if line contains pekerjaan keyword
                for pekerjaan in self.PEKERJAAN_LIST:
                    if pekerjaan in text_lower and len(text) < 30:
                        if result['pekerjaan'] is None:
                            result['pekerjaan'] = text.strip().upper()
                            break
            # ===== KEWARGANEGARAAN =====
            if 'wni' in text_lower:
                result['kewarganegaraan'] = 'WNI'
            elif 'wna' in text_lower:
                result['kewarganegaraan'] = 'WNA'
            elif 'warga' in text_lower and result['kewarganegaraan'] is None:
                val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
                if val:
                    result['kewarganegaraan'] = val.upper()
            # ===== BERLAKU HINGGA =====
            if 'berlaku' in text_lower or 'seumur' in text_lower:
                if result['berlaku_hingga'] is None:
                    if 'seumur' in text_lower or 'hidup' in text_lower:
                        result['berlaku_hingga'] = 'SEUMUR HIDUP'
                    else:
                        val = self._extract_after_label(text_normalized, 'berlaku')
                        if val:
                            result['berlaku_hingga'] = val.upper()
            # ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) =====
            # Look for date that is NOT tanggal lahir (different date)
            if result['tanggal_penerbitan'] is None:
                # Match date format at end of text or standalone date
                date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text.strip())
                if date_match:
                    found_date = date_match.group(1)
                    # Make sure it's not the same as tanggal_lahir
                    if result['tanggal_lahir'] != found_date:
                        # Likely penerbitan if after berlaku_hingga was found
                        if result['berlaku_hingga'] or i > len(texts) * 0.7:
                            result['tanggal_penerbitan'] = found_date
        # Post-processing
        result = self._post_process(result)
        return result
    def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
        """Ekstrak nilai setelah label (supports various separators)"""
        patterns = [
            rf'(?:{label_pattern})\s*:\s*(.+)',  # label: value
            rf'(?:{label_pattern})\s+([A-Z0-9].+)',  # label VALUE (uppercase start)
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                # Remove trailing colon or label fragment
                value = re.sub(r'^[:\s]+', '', value)
                value = re.sub(r'\s*:\s*$', '', value)
                if value and len(value) > 1:
                    return value
        return None
    def _parse_ttl(self, ttl_text: str, result: Dict):
        """Parse tempat/tanggal lahir dari text"""
        ttl_text = ttl_text.strip()
        # Normalize dates where OCR missed dashes:
        # "05 08 1978" -> "05-08-1978"
        # "05 08-1978" -> "05-08-1978"  
        # "05-08 1978" -> "05-08-1978"
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
        ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
        # Handle 8-digit date without separator: "05081978" -> "05-08-1978"
        date_8digit = re.search(r'(\d{8})', ttl_text)
        if date_8digit:
            d = date_8digit.group(1)
            formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}"
            ttl_text = ttl_text.replace(d, formatted)
        # Handle merged city+date like "JAKARTA05-08-1978" - add space before digits
        ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)
        # Format: "TEMPAT, DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY"
        date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
        if date_match:
            result['tanggal_lahir'] = date_match.group(1)
            # Tempat adalah bagian sebelum tanggal
            place = ttl_text[:date_match.start()].strip(' ,:-/')
            # Clean up label remnants
            place = re.sub(r'^(tempat|tgl|lahir|：|:)[/\s:：]*', '', place, flags=re.IGNORECASE).strip()
            if place and len(place) > 2:
                result['tempat_lahir'] = place.upper()
        else:
            # Coba split by comma
            parts = ttl_text.split(',')
            if len(parts) >= 2:
                result['tempat_lahir'] = parts[0].strip().upper()
                result['tanggal_lahir'] = parts[1].strip()
            elif len(parts) == 1 and len(ttl_text) > 2:
                result['tempat_lahir'] = ttl_text.upper()
    def _post_process(self, result: Dict) -> Dict:
        """Post-processing hasil ekstraksi"""
        # Validasi NIK (harus 16 digit)
        if result['nik'] and not re.match(r'^\d{16}$', result['nik']):
            cleaned = re.sub(r'\D', '', result['nik'])
            if len(cleaned) == 16:
                result['nik'] = cleaned
            else:
                result['nik'] = None
        # Clean all string values - remove leading colons and extra whitespace
        for field in result:
            if result[field] and isinstance(result[field], str):
                val = result[field]
                # Remove leading colons (standard and full-width)
                val = re.sub(r'^[\s:：]+', '', val)
                # Remove trailing colons
                val = re.sub(r'[\s:：]+$', '', val)
                # Remove double spaces
                val = re.sub(r'\s+', ' ', val)
                result[field] = val.strip()
        # Bersihkan label dari values
        for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
            if result[field]:
                # Remove common labels yang ter-capture
                result[field] = re.sub(
                    r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s:：]*', 
                    '', result[field], flags=re.IGNORECASE
                ).strip()
        # Fix status perkawinan yang masih mengandung label
        if result['status_perkawinan']:
            sp = result['status_perkawinan']
            sp = re.sub(r'^(STATUS|PERKAWINAN)[\s:：]*', '', sp, flags=re.IGNORECASE).strip()
            result['status_perkawinan'] = sp
        # Fix berlaku hingga
        if result['berlaku_hingga']:
            bh = result['berlaku_hingga']
            bh = re.sub(r'^(BERLAKU|HINGGA)[\s:：]*', '', bh, flags=re.IGNORECASE).strip()
            if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
                result['berlaku_hingga'] = 'SEUMUR HIDUP'
            else:
                result['berlaku_hingga'] = bh
        # Fix merged kabupaten/kota names (e.g., JAKARTASELATAN -> JAKARTA SELATAN)
        if result['kabupaten_kota']:
            kk = result['kabupaten_kota']
            # Add space before directional words
            kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)', 
                        r'\1 \2', kk, flags=re.IGNORECASE)
            # Common merged patterns
            kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
            result['kabupaten_kota'] = kk.upper()
        # Fix merged provinsi names
        if result['provinsi']:
            prov = result['provinsi']
            prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
            prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)', 
                          r'\1 \2', prov, flags=re.IGNORECASE)
            result['provinsi'] = prov.upper()
        # Fix merged alamat/address (e.g., JLKECAPIV -> JL KECAPI V)
        if result['alamat']:
            alamat = result['alamat']
            # Add space after common street prefixes
            alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add space before Roman numerals at the end (I, II, III, IV, V, VI, VII, VIII, IX, X)
            alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Add space before single digits/numbers at end
            alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
            # Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
            alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
            result['alamat'] = alamat.upper()
        return result
 if __name__ == "__main__":
    # Manual smoke test: run the extractor on a hand-written KTP sample and
    # print every extracted field.
    sample_lines = (
        'PROVINSI JAWA BARAT',
        'KABUPATEN BANDUNG',
        'NIK : 3204012345678901',
        'Nama : JOHN DOE',
        'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
        'Jenis Kelamin : LAKI-LAKI',
        'Alamat : JL. MERDEKA NO. 123',
        'RT/RW : 001/002',
        'Kel/Desa : SUKAMAJU',
        'Kecamatan : SUKASARI',
        'Agama : ISLAM',
        'Status Perkawinan : BELUM KAWIN',
        'Pekerjaan : KARYAWAN SWASTA',
        'Kewarganegaraan : WNI',
        'Berlaku Hingga : SEUMUR HIDUP',
    )
    # The extractor consumes OCR items shaped like {'text': ...}.
    sample_ocr = [{'text': line} for line in sample_lines]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    for field_name, field_value in result.items():
        print(f"{field_name}: {field_value}")
--- a/ocr_engine.py
+++ b/ocr_engine.py
@@ -0,0 +1,153 @@
 """
 OCR Engine menggunakan PaddleOCR 3.x
 Untuk membaca teks dari gambar dokumen Indonesia (KTP, KK)
 """
 from paddleocr import PaddleOCR
 import cv2
 import numpy as np
 from PIL import Image
 class OCREngine:
    """Wrapper around a PaddleOCR 3.x pipeline for Indonesian documents (KTP, KK)."""

    def __init__(self):
        """Create the PaddleOCR 3.x pipeline with the document helpers enabled."""
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=True,   # detect page rotation (0/90/180/270 degrees)
            use_doc_unwarping=True,              # perspective correction (trapezoid -> rectangle)
            use_textline_orientation=True,       # per-text-line orientation
        )

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Load an image and enhance it for OCR.

        The image is downscaled so its longest side is at most 2000 px,
        converted to grayscale, contrast-enhanced with CLAHE and converted
        back to a 3-channel BGR array.

        Args:
            image_path: Path to the image file.

        Returns:
            The enhanced image as a BGR array.

        Raises:
            ValueError: If OpenCV cannot read the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Tidak dapat membaca gambar: {image_path}")
        # Downscale when the longest side exceeds max_dim.
        max_dim = 2000
        height, width = img.shape[:2]
        if max(height, width) > max_dim:
            scale = max_dim / max(height, width)
            img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        # CLAHE works on a single channel, so go through grayscale.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        # Back to BGR before handing the image to PaddleOCR.
        enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
        return enhanced_bgr

    @staticmethod
    def _or_empty(value):
        """Return ``[]`` for ``None``, otherwise ``value`` unchanged.

        Used instead of a truthiness test, which raises ``ValueError`` when
        ``value`` is a NumPy array with more than one element.
        """
        return [] if value is None else value

    def _parse_result(self, res) -> list:
        """Convert one PaddleOCR result entry into a list of item dicts.

        Entries exposing ``rec_texts``/``rec_scores``/``dt_polys`` as
        attributes get real bounding boxes and centers. Otherwise dict-style
        access is tried on a best-effort basis, with an empty ``bbox`` and a
        synthetic ``y_center`` that preserves the original order.
        """
        items = []
        if hasattr(res, 'rec_texts') and hasattr(res, 'rec_scores') and hasattr(res, 'dt_polys'):
            texts = self._or_empty(res.rec_texts)
            scores = self._or_empty(res.rec_scores)
            polys = self._or_empty(res.dt_polys)
            for i, text in enumerate(texts):
                confidence = scores[i] if i < len(scores) else 0.0
                bbox = polys[i].tolist() if i < len(polys) and hasattr(polys[i], 'tolist') else []
                # Center of the box (from two opposite corners), used for sorting.
                if bbox and len(bbox) >= 4:
                    y_center = (bbox[0][1] + bbox[2][1]) / 2
                    x_center = (bbox[0][0] + bbox[2][0]) / 2
                else:
                    y_center = 0
                    x_center = 0
                items.append({
                    'text': text,
                    'confidence': float(confidence),
                    'bbox': bbox,
                    'y_center': y_center,
                    'x_center': x_center,
                })
        elif hasattr(res, '__getitem__'):
            # Fallback: dict-like result. Best effort - a failure here keeps
            # whatever was parsed so far, but is reported instead of hidden.
            try:
                texts = self._or_empty(res.get('rec_texts', res.get('texts', [])))
                scores = self._or_empty(res.get('rec_scores', res.get('scores', [])))
                for i, text in enumerate(texts):
                    confidence = scores[i] if i < len(scores) else 0.0
                    items.append({
                        'text': text,
                        'confidence': float(confidence),
                        'bbox': [],
                        'y_center': i * 10,  # simple ordering fallback
                        'x_center': 0,
                    })
            except Exception as exc:
                print(f"Error OCR (fallback parsing): {exc}")
        return items

    def extract_text(self, image_path: str, preprocess: bool = False) -> list:
        """Run OCR on an image and return the recognised text items.

        Args:
            image_path: Path to the image file.
            preprocess: When True, the image is first passed through
                ``preprocess_image`` and the enhanced array is given to
                PaddleOCR; when False the path is passed as-is.

        Returns:
            A list of dicts with keys ``'text'``, ``'confidence'``, ``'bbox'``,
            ``'y_center'`` and ``'x_center'``, sorted by ``y_center`` and then
            ``x_center``. An empty list is returned when nothing is
            recognised or when any error occurs (the error is printed).
        """
        try:
            ocr_input = self.preprocess_image(image_path) if preprocess else image_path
            # PaddleOCR 3.x API: predict() replaces the older ocr() call.
            result = self.ocr.predict(input=ocr_input)
            if not result:
                return []
            extracted = []
            for res in result:
                extracted.extend(self._parse_result(res))
            # Top-to-bottom, then left-to-right.
            extracted.sort(key=lambda item: (item['y_center'], item['x_center']))
            return extracted
        except Exception as e:
            print(f"Error OCR: {e}")
            import traceback
            traceback.print_exc()
            return []

    def get_raw_text(self, image_path: str) -> str:
        """Return all recognised text of an image as one newline-joined string."""
        results = self.extract_text(image_path)
        return '\n'.join([r['text'] for r in results])
 # Module-wide engine, created lazily on first request
 _ocr_engine = None
 def get_ocr_engine() -> OCREngine:
    """Return the shared OCREngine, constructing it on the first call."""
    global _ocr_engine
    engine = _ocr_engine
    if engine is None:
        # First use: build the engine once and remember it for later calls.
        engine = _ocr_engine = OCREngine()
    return engine
 if __name__ == "__main__":
    # Manual check: run OCR on the image given as the first CLI argument and
    # print each recognised line with its confidence. Does nothing without
    # an argument.
    import sys
    cli_args = sys.argv[1:]
    if cli_args:
        for item in get_ocr_engine().extract_text(cli_args[0]):
            print(f"[{item['confidence']:.2f}] {item['text']}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
 paddlepaddle
 paddleocr
 flask
 pillow
 opencv-python
--- a/static/style.css
+++ b/static/style.css
@@ -0,0 +1,538 @@
 /* OCR KTP/KK - Modern Dark Theme */
 :root {
    --bg-primary: #0f0f1a;
    --bg-secondary: #1a1a2e;
    --bg-tertiary: #252540;
    --accent-primary: #6366f1;
    --accent-secondary: #818cf8;
    --accent-gradient: linear-gradient(135deg, #6366f1 0%, #a855f7 100%);
    --text-primary: #f1f5f9;
    --text-secondary: #94a3b8;
    --text-muted: #64748b;
    --success: #22c55e;
    --error: #ef4444;
    --warning: #f59e0b;
    --border: #334155;
    --shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
    --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.4);
    --radius: 12px;
    --radius-lg: 16px;
 }
 * {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
 }
 body {
    font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
    background: var(--bg-primary);
    color: var(--text-primary);
    min-height: 100vh;
    line-height: 1.6;
 }
 .container {
    max-width: 800px;
    margin: 0 auto;
    padding: 2rem 1rem;
 }
 /* Header */
 header {
    text-align: center;
    margin-bottom: 2rem;
 }
 header h1 {
    font-size: 2.5rem;
    font-weight: 700;
    background: var(--accent-gradient);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem;
 }
 .subtitle {
    color: var(--text-secondary);
    font-size: 1.1rem;
 }
 /* Upload Section */
 .upload-section {
    background: var(--bg-secondary);
    border-radius: var(--radius-lg);
    padding: 2rem;
    box-shadow: var(--shadow-lg);
    margin-bottom: 2rem;
 }
 /* Document Type Selector */
 .doc-type-selector {
    display: flex;
    gap: 1rem;
    margin-bottom: 1.5rem;
 }
 .doc-btn {
    flex: 1;
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 0.5rem;
    padding: 1rem;
    background: var(--bg-tertiary);
    border: 2px solid transparent;
    border-radius: var(--radius);
    color: var(--text-secondary);
    font-size: 1rem;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
 }
 .doc-btn:hover {
    background: var(--bg-primary);
    color: var(--text-primary);
 }
 .doc-btn.active {
    background: var(--accent-gradient);
    color: white;
    border-color: var(--accent-secondary);
 }
 .doc-btn .icon {
    font-size: 1.5rem;
 }
 /* Dropzone */
 .dropzone {
    border: 2px dashed var(--border);
    border-radius: var(--radius);
    padding: 3rem 2rem;
    text-align: center;
    cursor: pointer;
    transition: all 0.3s ease;
    background: var(--bg-tertiary);
    position: relative;
    overflow: hidden;
 }
 .dropzone:hover,
 .dropzone.dragover {
    border-color: var(--accent-primary);
    background: rgba(99, 102, 241, 0.1);
 }
 .dropzone-content {
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 0.5rem;
 }
 .upload-icon {
    font-size: 4rem;
    margin-bottom: 0.5rem;
 }
 .dropzone p {
    color: var(--text-secondary);
 }
 .dropzone .hint {
    color: var(--text-muted);
    font-size: 0.875rem;
 }
 .file-btn {
    display: inline-block;
    padding: 0.75rem 1.5rem;
    background: var(--accent-gradient);
    color: white;
    border-radius: var(--radius);
    font-weight: 600;
    cursor: pointer;
    margin: 0.5rem 0;
    transition: transform 0.2s ease;
 }
 .file-btn:hover {
    transform: scale(1.05);
 }
 .file-types {
    font-size: 0.75rem;
    color: var(--text-muted);
 }
 .preview-image {
    max-width: 100%;
    max-height: 400px;
    border-radius: var(--radius);
    cursor: pointer;
 }
 /* Process Button */
 .process-btn {
    width: 100%;
    padding: 1rem;
    margin-top: 1.5rem;
    background: var(--accent-gradient);
    border: none;
    border-radius: var(--radius);
    color: white;
    font-size: 1.1rem;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: var(--shadow);
 }
 .process-btn:hover:not(:disabled) {
    transform: translateY(-2px);
    box-shadow: var(--shadow-lg);
 }
 .process-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
 }
 /* Results Section */
 .results-section {
    background: var(--bg-secondary);
    border-radius: var(--radius-lg);
    padding: 2rem;
    box-shadow: var(--shadow-lg);
    animation: slideUp 0.3s ease;
 }
@keyframes slideUp {
    from {
        opacity: 0;
        transform: translateY(20px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
 }
 .results-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 1.5rem;
    flex-wrap: wrap;
    gap: 1rem;
 }
 .results-header h2 {
    font-size: 1.5rem;
 }
 .results-actions {
    display: flex;
    gap: 0.5rem;
 }
 .action-btn {
    padding: 0.5rem 1rem;
    background: var(--bg-tertiary);
    border: 1px solid var(--border);
    border-radius: var(--radius);
    color: var(--text-primary);
    font-size: 0.875rem;
    cursor: pointer;
    transition: all 0.2s ease;
 }
 .action-btn:hover {
    background: var(--accent-primary);
    border-color: var(--accent-primary);
 }
 .action-btn.secondary {
    background: transparent;
 }
 /* Results Table */
 .results-table {
    width: 100%;
    border-collapse: collapse;
 }
 .results-table th,
 .results-table td {
    padding: 0.875rem 1rem;
    text-align: left;
    border-bottom: 1px solid var(--border);
 }
 .results-table th {
    background: var(--bg-tertiary);
    color: var(--text-secondary);
    font-weight: 600;
    font-size: 0.875rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
 }
 .results-table th:first-child {
    border-radius: var(--radius) 0 0 0;
 }
 .results-table th:last-child {
    border-radius: 0 var(--radius) 0 0;
 }
 .field-label {
    color: var(--text-secondary);
    font-weight: 500;
    width: 40%;
 }
 .field-value {
    color: var(--text-primary);
    font-weight: 600;
 }
 .results-table tr:hover {
    background: rgba(99, 102, 241, 0.05);
 }
 /* Editable Fields */
 .editable-field {
    width: 100%;
    padding: 0.5rem 0.75rem;
    background: var(--bg-tertiary);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text-primary);
    font-size: 0.95rem;
    font-weight: 600;
    font-family: inherit;
    transition: all 0.2s ease;
 }
 .editable-field:focus {
    outline: none;
    border-color: var(--accent-primary);
    background: var(--bg-secondary);
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2);
 }
 .editable-field::placeholder {
    color: var(--text-muted);
    font-weight: 400;
 }
 /* Region Dropdown Styles */
 .region-field-wrapper {
    display: flex;
    gap: 0.5rem;
    align-items: center;
 }
 .region-field-wrapper input,
 .region-field-wrapper select {
    flex: 1;
 }
 .region-dropdown {
    width: 100%;
    padding: 0.5rem 0.75rem;
    background: var(--bg-tertiary);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text-primary);
    font-size: 0.95rem;
    font-family: inherit;
    cursor: pointer;
 }
 .region-dropdown:focus {
    outline: none;
    border-color: var(--accent-primary);
 }
 .dropdown-toggle {
    padding: 0.5rem 0.75rem;
    background: var(--bg-tertiary);
    border: 1px solid var(--border);
    border-radius: 6px;
    color: var(--text-secondary);
    cursor: pointer;
    transition: all 0.2s ease;
    flex-shrink: 0;
 }
 .dropdown-toggle:hover {
    background: var(--accent-primary);
    color: white;
 }
 .dropdown-toggle.confirmed {
    background: var(--success);
    color: white;
    border-color: var(--success);
 }
 /* Validation Indicators */
 .validation-status {
    margin-left: 0.5rem;
    font-size: 0.875rem;
 }
 .validation-status.valid-field {
    color: var(--success);
 }
 .validation-status.invalid-field {
    color: var(--warning);
 }
 .editable-field.valid-field {
    border-color: var(--success);
 }
 .editable-field.invalid-field {
    border-color: var(--warning);
 }
 .suggestion-text {
    font-size: 0.75rem;
    color: var(--text-muted);
    margin-top: 0.25rem;
    font-style: italic;
 }
 /* Raw Text Section */
 .raw-text-section {
    margin-top: 1.5rem;
    padding-top: 1.5rem;
    border-top: 1px solid var(--border);
 }
 .raw-text-section h3 {
    font-size: 1rem;
    color: var(--text-secondary);
    margin-bottom: 1rem;
 }
 .raw-text-section pre {
    background: var(--bg-primary);
    padding: 1rem;
    border-radius: var(--radius);
    font-family: 'Consolas', monospace;
    font-size: 0.875rem;
    color: var(--text-secondary);
    white-space: pre-wrap;
    word-wrap: break-word;
    max-height: 300px;
    overflow-y: auto;
 }
 /* Error Section */
 .error-section {
    margin-top: 1rem;
 }
 .error-content {
    background: rgba(239, 68, 68, 0.1);
    border: 1px solid var(--error);
    border-radius: var(--radius);
    padding: 1rem;
    display: flex;
    align-items: center;
    gap: 0.75rem;
 }
 .error-icon {
    font-size: 1.5rem;
 }
 .error-content p {
    color: var(--error);
 }
 /* Footer */
 footer {
    text-align: center;
    margin-top: 2rem;
    padding-top: 1rem;
    border-top: 1px solid var(--border);
 }
 footer p {
    color: var(--text-muted);
    font-size: 0.875rem;
 }
 footer a {
    color: var(--accent-secondary);
    text-decoration: none;
 }
 footer a:hover {
    text-decoration: underline;
 }
 /* Responsive */
@media (max-width: 600px) {
    .container {
        padding: 1rem;
    }
    header h1 {
        font-size: 2rem;
    }
    .upload-section,
    .results-section {
        padding: 1.5rem;
    }
    .doc-type-selector {
        flex-direction: column;
    }
    .results-header {
        flex-direction: column;
        align-items: flex-start;
    }
    .results-actions {
        width: 100%;
        justify-content: flex-start;
    }
    .field-label {
        width: 45%;
    }
 }
 /* Scrollbar */
 ::-webkit-scrollbar {
    width: 8px;
    height: 8px;
 }
 ::-webkit-scrollbar-track {
    background: var(--bg-tertiary);
 }
 ::-webkit-scrollbar-thumb {
    background: var(--border);
    border-radius: 4px;
 }
 ::-webkit-scrollbar-thumb:hover {
    background: var(--text-muted);
 }
--- a/templates/index.html
+++ b/templates/index.html
@@ -0,0 +1,570 @@
 <!DOCTYPE html>
 <html lang="id">
 <head>
    <!-- Document metadata: encoding, mobile viewport, tab title, and the stylesheet -->
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>OCR KTP/KK - Pembaca Dokumen Indonesia</title>
    <!-- Stylesheet is referenced by an absolute path -->
    <link rel="stylesheet" href="/static/style.css">
 </head>
 <body>
    <div class="container">
        <!-- Page banner: application name and tagline -->
        <header>
            <h1>📄 OCR KTP/KK</h1>
            <p class="subtitle">Pembaca Dokumen Indonesia Offline</p>
        </header>
        <main>
            <!-- Upload Section: pick the document type, choose an image, start OCR -->
            <section class="upload-section">
                <!-- data-type of the active button is read by the script and sent as doc_type -->
                <div class="doc-type-selector">
                    <button class="doc-btn active" data-type="ktp">
                        <span class="icon">🪪</span>
                        KTP
                    </button>
                    <button class="doc-btn" data-type="kk">
                        <span class="icon">👨‍👩‍👧‍👦</span>
                        Kartu Keluarga
                    </button>
                </div>
                <div class="dropzone" id="dropzone">
                    <div class="dropzone-content">
                        <div class="upload-icon">📷</div>
                        <p>Drag &amp; drop gambar di sini</p>
                        <p class="hint">atau</p>
                        <label class="file-btn">
                            Pilih File
                            <input type="file" id="fileInput" accept="image/*" hidden>
                        </label>
                        <p class="file-types">PNG, JPG, JPEG, BMP, WEBP (max 16MB)</p>
                    </div>
                    <!-- src is assigned by the script after a file is chosen; alt gives the image a text alternative -->
                    <img id="preview" class="preview-image" alt="Pratinjau dokumen yang dipilih" style="display: none;">
                </div>
                <!-- Stays disabled until the script has accepted a file -->
                <button id="processBtn" class="process-btn" disabled>
                    <span class="btn-text">🔍 Proses OCR</span>
                    <span class="btn-loading" style="display: none;">⏳ Memproses...</span>
                </button>
            </section>
            <!-- Results Section: hidden until the script renders an OCR response -->
            <section class="results-section" id="resultsSection" style="display: none;">
                <div class="results-header">
                    <h2>📋 Hasil Ekstraksi</h2>
                    <!-- Each button is wired up by id in the script below -->
                    <div class="results-actions">
                        <button class="action-btn" id="copyBtn" title="Copy JSON">📋 Copy</button>
                        <button class="action-btn" id="exportBtn" title="Export JSON">💾 Export</button>
                        <button class="action-btn secondary" id="toggleRaw">📝 Raw Text</button>
                    </div>
                </div>
                <div class="results-content">
                    <table class="results-table" id="resultsTable">
                        <thead>
                            <tr>
                                <th>Field</th>
                                <th>Nilai</th>
                            </tr>
                        </thead>
                        <!-- Rows are appended here by addResultRow() and addRegionRow() -->
                        <tbody id="resultsBody">
                        </tbody>
                    </table>
                    <!-- Raw OCR text, toggled by the #toggleRaw button -->
                    <div class="raw-text-section" id="rawTextSection" style="display: none;">
                        <h3>Raw OCR Text</h3>
                        <pre id="rawText"></pre>
                    </div>
                </div>
            </section>
            <!-- Error Section: shown and hidden by showError() / hideError() -->
            <!-- role="alert" lets assistive technology announce the message when it appears -->
            <section class="error-section" id="errorSection" role="alert" style="display: none;">
                <div class="error-content">
                    <!-- Decorative icon; the message text carries the meaning -->
                    <span class="error-icon" aria-hidden="true">⚠️</span>
                    <p id="errorMessage"></p>
                </div>
            </section>
        </main>
        <footer>
            <!-- rel="noopener noreferrer" keeps the new tab from getting a window.opener handle to this page -->
            <p>OCR menggunakan <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank" rel="noopener noreferrer">PaddleOCR</a> • Data
                diproses secara lokal</p>
        </footer>
    </div>
    <script>
        // Page state shared by the handlers below
        let selectedFile = null;   // file accepted by handleFile(); null when nothing is selected
        let docType = 'ktp';       // data-type of the active document-type button
        let extractedData = null;  // fields of the last successful OCR response (edited in place)
        // Cached DOM references
        const dropzone = document.querySelector('#dropzone');
        const fileInput = document.querySelector('#fileInput');
        const preview = document.querySelector('#preview');
        const processBtn = document.querySelector('#processBtn');
        const resultsSection = document.querySelector('#resultsSection');
        const resultsBody = document.querySelector('#resultsBody');
        const rawText = document.querySelector('#rawText');
        const rawTextSection = document.querySelector('#rawTextSection');
        const errorSection = document.querySelector('#errorSection');
        const errorMessage = document.querySelector('#errorMessage');
        const docBtns = document.querySelectorAll('.doc-btn');
        // Human-readable labels for the result table, keyed by response field name
        const fieldLabels = {
            // KTP fields
            nik: 'NIK',
            nama: 'Nama',
            tempat_lahir: 'Tempat Lahir',
            tanggal_lahir: 'Tanggal Lahir',
            jenis_kelamin: 'Jenis Kelamin',
            gol_darah: 'Gol. Darah',
            alamat: 'Alamat',
            rt_rw: 'RT/RW',
            kel_desa: 'Kel/Desa',
            kecamatan: 'Kecamatan',
            agama: 'Agama',
            status_perkawinan: 'Status Perkawinan',
            pekerjaan: 'Pekerjaan',
            kewarganegaraan: 'Kewarganegaraan',
            berlaku_hingga: 'Berlaku Hingga',
            provinsi: 'Provinsi',
            kabupaten_kota: 'Kabupaten/Kota',
            tanggal_penerbitan: 'Tanggal Penerbitan',
            // KK fields
            no_kk: 'No. KK',
            nama_kepala_keluarga: 'Kepala Keluarga',
            kode_pos: 'Kode Pos',
            anggota_keluarga: 'Jumlah Anggota'
        };
        // Document type selection: the clicked button becomes the only "active" one
        for (const clicked of docBtns) {
            clicked.addEventListener('click', () => {
                for (const candidate of docBtns) {
                    candidate.classList.toggle('active', candidate === clicked);
                }
                docType = clicked.dataset.type;
            });
        }
        // Drag & drop: highlight the dropzone while dragging, then take the first dropped file
        dropzone.addEventListener('dragover', (event) => {
            // preventDefault marks the dropzone as a valid drop target
            event.preventDefault();
            dropzone.classList.add('dragover');
        });
        dropzone.addEventListener('dragleave', () => {
            dropzone.classList.remove('dragover');
        });
        dropzone.addEventListener('drop', (event) => {
            event.preventDefault();
            dropzone.classList.remove('dragover');
            const dropped = event.dataTransfer.files;
            if (dropped.length === 0) {
                return;
            }
            handleFile(dropped[0]);
        });
        // File picker: hand the first chosen file to handleFile()
        fileInput.addEventListener('change', (event) => {
            const chosen = event.target.files;
            if (chosen.length === 0) {
                return;
            }
            handleFile(chosen[0]);
        });
        // Click on the dropzone opens the file picker
        dropzone.addEventListener('click', (e) => {
            // The <label class="file-btn"> wraps the input and opens the picker by
            // itself; reacting to that bubbled click as well would request the
            // picker a second time.
            if (e.target.closest('.file-btn')) {
                return;
            }
            if (e.target === dropzone || e.target.closest('.dropzone-content')) {
                fileInput.click();
            }
        });
        /**
         * Validate a chosen file and make it the current selection.
         *
         * Rejects non-image MIME types and files above 16MB with an error
         * banner. Otherwise stores the file, shows its preview in place of the
         * dropzone prompt, enables the process button and hides any previous
         * error and results.
         *
         * @param {File} file - file taken from the picker or a drop event
         */
        function handleFile(file) {
            const maxBytes = 16 * 1024 * 1024;
            const isImage = file.type.startsWith('image/');
            if (!isImage) {
                showError('File harus berupa gambar');
                return;
            }
            if (file.size > maxBytes) {
                showError('Ukuran file maksimal 16MB');
                return;
            }
            selectedFile = file;
            // Swap the prompt for a preview once the file has been read as a data URL
            const reader = new FileReader();
            reader.addEventListener('load', () => {
                preview.src = reader.result;
                preview.style.display = 'block';
                dropzone.querySelector('.dropzone-content').style.display = 'none';
            });
            reader.readAsDataURL(file);
            resultsSection.style.display = 'none';
            hideError();
            processBtn.disabled = false;
        }
        // Process button: upload the selected image and render the extraction result
        processBtn.addEventListener('click', async () => {
            if (!selectedFile) return;
            const btnText = processBtn.querySelector('.btn-text');
            const btnLoading = processBtn.querySelector('.btn-loading');
            // Lock the button and show the loading label for the whole request
            processBtn.disabled = true;
            btnText.style.display = 'none';
            btnLoading.style.display = 'inline';
            try {
                const formData = new FormData();
                formData.append('file', selectedFile);
                formData.append('doc_type', docType);
                const response = await fetch('/upload', {
                    method: 'POST',
                    body: formData
                });
                let result;
                try {
                    result = await response.json();
                } catch {
                    // A non-JSON body (for example an HTML error page) would otherwise
                    // surface as a JSON parser message; report the HTTP status instead.
                    throw new Error(`Respons server tidak valid (HTTP ${response.status})`);
                }
                if (result.success) {
                    extractedData = result.data;
                    // Awaited so that rendering failures reach the catch below and the
                    // button stays locked until the rows are drawn.
                    await displayResults(result);
                    hideError();
                } else {
                    showError(result.error || 'Terjadi kesalahan');
                    resultsSection.style.display = 'none';
                }
            } catch (error) {
                showError('Terjadi kesalahan: ' + error.message);
                // Do not leave results of an earlier run next to the error banner
                resultsSection.style.display = 'none';
            } finally {
                // The selection may have been cleared (preview click) while waiting
                processBtn.disabled = !selectedFile;
                btnText.style.display = 'inline';
                btnLoading.style.display = 'none';
            }
        });
        // Region fields rendered with a dropdown, listed from the widest level to the narrowest
        const regionFields = ['provinsi', 'kabupaten_kota', 'kecamatan', 'kel_desa'];
        // Dropdown option cache: provinces as a flat list, lower levels keyed by the parent code
        let regionData = {
            provinces: [],
            regencies: {},
            districts: {},
            villages: {}
        };
        // Per-field verdicts stored by validateRegionData() and by dropdown selections
        let validationResult = null;
        // Display order of the result rows; keys missing from this list are sorted last
        const fieldOrder = [
            // Location hierarchy first
            ...regionFields,
            // Identity
            'nik', 'nama', 'tempat_lahir', 'tanggal_lahir', 'jenis_kelamin', 'gol_darah',
            // Address
            'alamat', 'rt_rw',
            // Other info
            'agama', 'status_perkawinan', 'pekerjaan', 'kewarganegaraan', 'berlaku_hingga',
            // Issue date
            'tanggal_penerbitan',
            // KK specific
            'no_kk', 'nama_kepala_keluarga', 'kode_pos', 'anggota_keluarga'
        ];
        /**
         * Render one OCR response into the results table.
         *
         * Clears the table, stores `result.data` as the editable data set,
         * validates the region fields, then adds one row per key in
         * `fieldOrder` order. Finally fills the raw text box, reveals the
         * results section and scrolls to it.
         *
         * @param {{data: Object, raw_text: string}} result - parsed /upload response
         */
        async function displayResults(result) {
            resultsBody.innerHTML = '';
            const { data } = result;
            extractedData = data;
            // Region verdicts must exist before the region rows are built
            await validateRegionData(data);
            // Keys unknown to fieldOrder rank after every known key
            const rankOf = (field) => {
                const position = fieldOrder.indexOf(field);
                return position === -1 ? fieldOrder.length : position;
            };
            const orderedKeys = Object.keys(data).sort((a, b) => rankOf(a) - rankOf(b));
            for (const key of orderedKeys) {
                const value = data[key];
                if (key === 'anggota_keluarga') {
                    // Family members are summarised as a read-only count
                    const memberCount = Array.isArray(value) ? value.length : 0;
                    addResultRow('Jumlah Anggota', memberCount + ' orang', null, false);
                    continue;
                }
                const label = fieldLabels[key] || key;
                if (regionFields.includes(key)) {
                    await addRegionRow(label, value || '', key);
                } else {
                    addResultRow(label, value || '', key, true);
                }
            }
            rawText.textContent = result.raw_text;
            resultsSection.style.display = 'block';
            resultsSection.scrollIntoView({ behavior: 'smooth' });
        }
        /**
         * Ask the server to validate the region fields of `data` and store the
         * verdicts in `validationResult`.
         *
         * Failures are logged and leave `validationResult` as null, so the rows
         * are still rendered, just without validation marks.
         *
         * @param {Object} data - extracted fields, posted as JSON
         */
        async function validateRegionData(data) {
            // Forget the previous document's verdicts first: if this request fails
            // or reports no success, they must not be applied to the new document.
            validationResult = null;
            try {
                const response = await fetch('/api/validate-region', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify(data)
                });
                const result = await response.json();
                if (result.success) {
                    validationResult = result.validation;
                }
            } catch (e) {
                console.error('Validation error:', e);
            }
        }
        /**
         * Append an editable region row (text input plus an on-demand dropdown).
         *
         * The input is pre-filled with the validation suggestion when there is
         * one, otherwise with the extracted value. Choosing a dropdown entry
         * stores its name in `extractedData`, records its code in
         * `validationResult` for the dependent dropdowns, and clears the
         * dependent fields.
         *
         * @param {string} label - text for the label cell
         * @param {string} value - extracted value for this field
         * @param {string} key - field name, one of `regionFields`
         */
        async function addRegionRow(label, value, key) {
            const row = document.createElement('tr');
            const validation = validationResult?.[key];
            const isValid = validation?.valid;
            const suggestion = validation?.suggestion;
            const shownValue = suggestion || value || '';
            // Status indicator
            const statusIcon = isValid ? '✓' : (value ? '⚠' : '');
            const statusClass = isValid ? 'valid-field' : (value ? 'invalid-field' : '');
            // Only the locally derived status class and icon are interpolated here.
            // Label, key, value and suggestion are assigned through DOM properties
            // below, so quotes or markup in OCR text cannot break or inject HTML.
            row.innerHTML = `
                <td class="field-label">
                    <span class="validation-status ${statusClass}">${statusIcon}</span>
                </td>
                <td class="field-value">
                    <div class="region-field-wrapper">
                        <input type="text" class="editable-field ${statusClass}" placeholder="Ketik atau pilih...">
                        <select class="region-dropdown" style="display: none;">
                            <option value="">-- Pilih --</option>
                        </select>
                        <button type="button" class="dropdown-toggle" title="Pilih dari daftar">▼</button>
                    </div>
                </td>
            `;
            const input = row.querySelector('input');
            const select = row.querySelector('select');
            const toggleBtn = row.querySelector('.dropdown-toggle');
            // Label text goes in front of the status icon
            row.querySelector('.field-label').prepend(label);
            input.dataset.key = key;
            select.dataset.key = key;
            toggleBtn.dataset.key = key;
            input.value = shownValue;
            if (suggestion && suggestion !== value) {
                const hint = document.createElement('div');
                hint.className = 'suggestion-text';
                hint.textContent = `Saran: ${suggestion}`;
                row.querySelector('.field-value').appendChild(hint);
            }
            // The input shows the suggestion, so copy/export must carry the same text
            if (extractedData && shownValue !== value) {
                extractedData[key] = shownValue;
            }
            // Input change
            input.addEventListener('input', (e) => {
                if (extractedData) {
                    extractedData[key] = e.target.value;
                }
            });
            // Toggle dropdown; options are (re)loaded each time it opens
            toggleBtn.addEventListener('click', async () => {
                if (select.style.display === 'none') {
                    await loadDropdownOptions(key, select);
                    select.style.display = 'block';
                    input.style.display = 'none';
                } else {
                    select.style.display = 'none';
                    input.style.display = 'block';
                }
            });
            // Select change
            select.addEventListener('change', (e) => {
                const selectedOption = e.target.options[e.target.selectedIndex];
                const selectedCode = selectedOption.value;
                const selectedName = selectedOption.text !== '-- Pilih --' ? selectedOption.text : '';
                input.value = selectedName;
                if (extractedData) {
                    extractedData[key] = selectedName;
                }
                // Update validation result with selected code for cascading
                if (!validationResult) validationResult = {};
                validationResult[key] = {
                    valid: !!selectedCode,
                    code: selectedCode,
                    suggestion: selectedName
                };
                select.style.display = 'none';
                input.style.display = 'block';
                // Change toggle button to checkmark if valid selection
                if (selectedCode) {
                    toggleBtn.textContent = '✓';
                    toggleBtn.classList.add('confirmed');
                    input.classList.remove('invalid-field');
                    input.classList.add('valid-field');
                } else {
                    toggleBtn.textContent = '▼';
                    toggleBtn.classList.remove('confirmed');
                    // An emptied field must not keep the "valid" styling
                    input.classList.remove('valid-field');
                }
                // Clear dependent fields and their codes
                clearDependentFields(key);
            });
            resultsBody.appendChild(row);
        }
        /**
         * Fill `select` with the choices for the region level `key`.
         *
         * Provinces are fetched once and cached. Every lower level is fetched
         * per parent code (taken from `validationResult`) and cached under that
         * code; without a parent code only the placeholder option is shown.
         * Any failure replaces the options with a single error entry.
         *
         * @param {string} key - region field name
         * @param {HTMLSelectElement} select - dropdown to populate
         */
        async function loadDropdownOptions(key, select) {
            select.innerHTML = '<option value="">Loading...</option>';
            // Per child level: parent field supplying the code, cache bucket, endpoint
            const childLevels = {
                kabupaten_kota: { parent: 'provinsi', bucket: 'regencies', url: (code) => `/api/regencies/${code}` },
                kecamatan: { parent: 'kabupaten_kota', bucket: 'districts', url: (code) => `/api/districts/${code}` },
                kel_desa: { parent: 'kecamatan', bucket: 'villages', url: (code) => `/api/villages/${code}` }
            };
            try {
                let items = [];
                if (key === 'provinsi') {
                    if (!regionData.provinces.length) {
                        const payload = await (await fetch('/api/provinces')).json();
                        regionData.provinces = payload.data || [];
                    }
                    items = regionData.provinces;
                } else if (Object.prototype.hasOwnProperty.call(childLevels, key)) {
                    const level = childLevels[key];
                    const parentCode = validationResult?.[level.parent]?.code;
                    if (parentCode) {
                        const cache = regionData[level.bucket];
                        if (!cache[parentCode]) {
                            const payload = await (await fetch(level.url(parentCode))).json();
                            cache[parentCode] = payload.data || [];
                        }
                        items = cache[parentCode];
                    }
                }
                select.innerHTML = '<option value="">-- Pilih --</option>';
                items.forEach((entry) => {
                    const option = document.createElement('option');
                    option.value = entry.code;
                    option.textContent = entry.name;
                    select.appendChild(option);
                });
            } catch (e) {
                select.innerHTML = '<option value="">Error loading data</option>';
            }
        }
        /**
         * Reset every region level below `key` after that level changed.
         *
         * For each dependent field this empties its input and the matching
         * `extractedData` entry, removes the valid/invalid and "confirmed"
         * styling, and invalidates its stored validation code so the dropdowns
         * below cannot be loaded against a stale parent.
         *
         * @param {string} key - region field whose value was just changed
         */
        function clearDependentFields(key) {
            const dependents = {
                'provinsi': ['kabupaten_kota', 'kecamatan', 'kel_desa'],
                'kabupaten_kota': ['kecamatan', 'kel_desa'],
                'kecamatan': ['kel_desa']
            };
            (dependents[key] || []).forEach(depKey => {
                const input = document.querySelector(`input[data-key="${depKey}"]`);
                if (input) {
                    input.value = '';
                    // An emptied field is neither confirmed nor flagged
                    input.classList.remove('valid-field', 'invalid-field');
                }
                // Undo the checkmark left by an earlier dropdown selection
                const toggleBtn = document.querySelector(`.dropdown-toggle[data-key="${depKey}"]`);
                if (toggleBtn) {
                    toggleBtn.textContent = '▼';
                    toggleBtn.classList.remove('confirmed');
                }
                if (extractedData) extractedData[depKey] = '';
                // Clear validation code for cascading
                if (validationResult && validationResult[depKey]) {
                    validationResult[depKey] = { valid: false, code: null, suggestion: null };
                }
            });
        }
        /**
         * Append a label/value row to the results table.
         *
         * The row is built with DOM properties instead of an HTML string, so
         * quotes or markup in the OCR text cannot cut the value short or inject
         * HTML.
         *
         * @param {string} label - text for the label cell
         * @param {string} value - field value; empty shows '' (editable) or '-' (read-only)
         * @param {?string} key - `extractedData` key kept in sync with the input; null for read-only rows
         * @param {boolean} [editable=true] - render a text input instead of plain text
         */
        function addResultRow(label, value, key, editable = true) {
            const row = document.createElement('tr');
            const labelCell = document.createElement('td');
            labelCell.className = 'field-label';
            labelCell.textContent = label;
            const valueCell = document.createElement('td');
            valueCell.className = 'field-value';
            if (editable && key) {
                const input = document.createElement('input');
                input.type = 'text';
                input.className = 'editable-field';
                input.dataset.key = key;
                input.placeholder = 'Klik untuk edit...';
                input.value = value || '';
                // Keep the data used by copy/export in step with manual edits
                input.addEventListener('input', (e) => {
                    if (extractedData) {
                        extractedData[key] = e.target.value;
                    }
                });
                valueCell.appendChild(input);
            } else {
                valueCell.textContent = value || '-';
            }
            row.append(labelCell, valueCell);
            resultsBody.appendChild(row);
        }
        // Raw text button: flip the visibility of the raw OCR text box
        document.getElementById('toggleRaw').addEventListener('click', () => {
            const currentlyHidden = rawTextSection.style.display === 'none';
            rawTextSection.style.display = currentlyHidden ? 'block' : 'none';
        });
        // Copy button: put the (possibly edited) data on the clipboard as pretty-printed JSON
        document.getElementById('copyBtn').addEventListener('click', async () => {
            if (!extractedData) return;
            try {
                // navigator.clipboard only exists in secure contexts (HTTPS or localhost)
                if (!navigator.clipboard) {
                    throw new Error('Clipboard tidak tersedia (perlu HTTPS atau localhost)');
                }
                await navigator.clipboard.writeText(JSON.stringify(extractedData, null, 2));
                alert('Data berhasil disalin!');
            } catch (error) {
                // Report denied permission or a missing API instead of failing silently
                showError('Gagal menyalin data: ' + error.message);
            }
        });
        // Export button: download the (possibly edited) data as "<docType>_data.json"
        document.getElementById('exportBtn').addEventListener('click', () => {
            if (!extractedData) {
                return;
            }
            const payload = JSON.stringify(extractedData, null, 2);
            const objectUrl = URL.createObjectURL(new Blob([payload], { type: 'application/json' }));
            // A detached <a download> element triggers the save when clicked
            const link = document.createElement('a');
            link.download = `${docType}_data.json`;
            link.href = objectUrl;
            link.click();
            URL.revokeObjectURL(objectUrl);
        });
        /**
         * Show the error banner with the given text.
         *
         * @param {string} message - text placed in the banner
         */
        function showError(message) {
            errorSection.style.display = 'block';
            errorMessage.textContent = message;
        }
        /** Hide the error banner; its text is left in place. */
        function hideError() {
            errorSection.style.setProperty('display', 'none');
        }
        // Clicking the preview discards the selection and brings the upload prompt back
        preview.addEventListener('click', () => {
            selectedFile = null;
            // Emptying the input lets the same file be picked again
            fileInput.value = '';
            processBtn.disabled = true;
            dropzone.querySelector('.dropzone-content').style.display = 'flex';
            preview.style.display = 'none';
        });
    </script>
 </body>
 </html>