def allowed_file(filename):
    """Tell whether *filename* ends in one of the accepted image extensions.

    The extension is whatever follows the last dot, compared
    case-insensitively against ``ALLOWED_EXTENSIONS``. A name without any
    dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
def normalize_name(name):
    """Normalize a region name for comparison.

    The value is upper-cased, stripped of surrounding whitespace, and all
    dots and spaces are removed, so "D.K.I. Jakarta" and "DKI JAKARTA"
    normalize to the same string. Falsy input (``None``, ``""``) yields
    ``""``.
    """
    if not name:
        return ""
    return name.upper().strip().replace(".", "").replace(" ", "")


def find_best_match(search_name, items, key='name'):
    """Return the first item whose name matches *search_name*, else ``None``.

    Matching is done on normalized names (see ``normalize_name``) in two
    passes over *items*, in order:

    1. exact equality;
    2. containment in either direction (the search text contains the item
       name, or the item name contains the search text).

    Args:
        search_name: Name to look for (e.g. text read by OCR).
        items: Iterable of dicts describing candidate regions.
        key: Dict key holding each candidate's name.

    Returns:
        The matching dict from *items*, or ``None`` when nothing matches,
        when *items* is empty, or when *search_name* is empty after
        normalization.
    """
    if not search_name or not items:
        return None

    wanted = normalize_name(search_name)
    if not wanted:
        # e.g. "..." or " . ": the empty string is "contained" in every
        # name, which would make the first item a bogus match.
        return None

    # Normalize once and drop candidates without a usable name: an empty
    # candidate name is a substring of any search text and must not match.
    candidates = []
    for item in items:
        item_norm = normalize_name(item.get(key, ''))
        if item_norm:
            candidates.append((item_norm, item))

    # Pass 1: exact match.
    for item_norm, item in candidates:
        if item_norm == wanted:
            return item

    # Pass 2: containment in either direction.
    for item_norm, item in candidates:
        if wanted in item_norm or item_norm in wanted:
            return item

    return None
class KKExtractor:
    """Extract structured fields from OCR results of a Kartu Keluarga (KK)."""

    def __init__(self):
        pass

    def extract(self, ocr_results: List[Dict]) -> Dict:
        """Extract KK fields from OCR results.

        Args:
            ocr_results: List of OCR entries (per the original docstring,
                the output of ``OCREngine.extract_text()``). Only the
                ``'text'`` key of each entry is read.

        Returns:
            Dict with the keys ``no_kk``, ``nama_kepala_keluarga``,
            ``alamat``, ``rt_rw``, ``kel_desa``, ``kecamatan``,
            ``kabupaten_kota``, ``provinsi``, ``kode_pos`` (each a string
            or ``None``) and ``anggota_keluarga`` (list of member dicts).
        """
        all_text = '\n'.join([r['text'] for r in ocr_results])

        result = {
            'no_kk': None,
            'nama_kepala_keluarga': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'kabupaten_kota': None,
            'provinsi': None,
            'kode_pos': None,
            'anggota_keluarga': [],
        }

        # KK number: first standalone 16-digit run anywhere in the text.
        kk_match = re.search(r'\b(\d{16})\b', all_text)
        if kk_match:
            result['no_kk'] = kk_match.group(1)

        # Becomes True once the member-table header row has been seen.
        in_table = False

        for i, ocr in enumerate(ocr_results):
            text = ocr['text'].strip()
            text_lower = text.lower()

            # Province
            if 'provinsi' in text_lower and result['provinsi'] is None:
                result['provinsi'] = self._extract_value(text, 'provinsi')

            # Regency / city
            if ('kabupaten' in text_lower or 'kota' in text_lower) and result['kabupaten_kota'] is None:
                val = self._extract_value(text, 'kabupaten') or self._extract_value(text, 'kota')
                # Fall back to the whole line when no value follows the label.
                result['kabupaten_kota'] = val if val else text

            # District
            if 'kecamatan' in text_lower and result['kecamatan'] is None:
                result['kecamatan'] = self._extract_value(text, 'kecamatan')

            # Village
            if ('kelurahan' in text_lower or 'desa' in text_lower) and result['kel_desa'] is None:
                result['kel_desa'] = self._extract_value(text, 'kelurahan') or self._extract_value(text, 'desa')

            # Labelled KK number: 16 digits on this line or on the next one.
            if 'no' in text_lower and ('kk' in text_lower or 'kartu' in text_lower):
                match = re.search(r'(\d{16})', text)
                if match:
                    result['no_kk'] = match.group(1)
                elif i + 1 < len(ocr_results):
                    next_text = ocr_results[i + 1]['text']
                    match = re.search(r'(\d{16})', next_text)
                    if match:
                        result['no_kk'] = match.group(1)

            # Head of family. Only filled while still empty: member rows
            # carry "KEPALA KELUARGA" in the relationship column and must
            # not overwrite a name that was already found.
            if ('kepala' in text_lower and 'keluarga' in text_lower
                    and not result['nama_kepala_keluarga']):
                result['nama_kepala_keluarga'] = self._extract_value(text, 'keluarga')
                if not result['nama_kepala_keluarga'] and i + 1 < len(ocr_results):
                    # The name may be on the following line.
                    next_text = ocr_results[i + 1]['text'].strip()
                    if not any(kw in next_text.lower() for kw in ['alamat', 'rt', 'rw', 'provinsi']):
                        result['nama_kepala_keluarga'] = next_text

            # Address
            if 'alamat' in text_lower and result['alamat'] is None:
                result['alamat'] = self._extract_value(text, 'alamat')

            # RT/RW
            rt_rw_match = re.search(r'rt\s*/?\s*rw\s*[:\s]*(\d+)\s*/\s*(\d+)', text_lower)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # Postal code
            if 'kode' in text_lower and 'pos' in text_lower:
                match = re.search(r'(\d{5})', text)
                if match:
                    result['kode_pos'] = match.group(1)

            # Header row of the family-member table.
            if self._is_table_header(text_lower):
                in_table = True
                continue

            # Member rows. The flag alone decides: the header's y position
            # may legitimately be 0 (or absent), so it must not be used as
            # a truthiness gate.
            if in_table:
                member = self._extract_member(text, ocr_results, i)
                if member:
                    result['anggota_keluarga'].append(member)

        return self._post_process(result)

    def _extract_value(self, text: str, field: str) -> Optional[str]:
        """Return the text following the *field* label, or ``None``.

        Two forms are tried, case-insensitively: ``label: value`` and
        ``label value``. The label may be followed by extra letters
        (e.g. ``provinsi`` also matches ``provinsinya``).
        """
        patterns = [
            rf'{field}[a-z]*\s*:\s*(.+)',
            rf'{field}[a-z]*\s+(.+)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                value = re.sub(r'^[:\s]+', '', value)
                if value:
                    return value
        return None

    def _is_table_header(self, text: str) -> bool:
        """Return True when *text* (lower-cased) has at least two header keywords."""
        header_keywords = ['no', 'nama lengkap', 'nik', 'jenis kelamin', 'hubungan']
        count = sum(1 for kw in header_keywords if kw in text)
        return count >= 2

    def _extract_member(self, text: str, all_results: List[Dict], current_idx: int) -> Optional[Dict]:
        """Build a member dict from one table row, or ``None`` if it has no NIK.

        Only *text* is inspected; *all_results* and *current_idx* are
        accepted for interface compatibility and are not used. ``nama``,
        ``tempat_lahir`` and ``tanggal_lahir`` are always ``None``.
        """
        nik_match = re.search(r'\b(\d{16})\b', text)
        if not nik_match:
            return None

        member = {
            'nik': nik_match.group(1),
            'nama': None,
            'jenis_kelamin': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'hubungan': None,
        }

        lowered = text.lower()
        padded = f' {lowered} '  # lets a lone "l" / "p" column be matched as a word

        # Gender
        if 'laki' in lowered or ' l ' in padded:
            member['jenis_kelamin'] = 'LAKI-LAKI'
        elif 'perempuan' in lowered or ' p ' in padded:
            member['jenis_kelamin'] = 'PEREMPUAN'

        # Family relationship: first keyword (in this order) found in the row.
        hubungan_keywords = {
            'kepala': 'KEPALA KELUARGA',
            'istri': 'ISTRI',
            'suami': 'SUAMI',
            'anak': 'ANAK',
            'menantu': 'MENANTU',
            'cucu': 'CUCU',
            'orang tua': 'ORANG TUA',
            'mertua': 'MERTUA',
        }

        for keyword, value in hubungan_keywords.items():
            if keyword in lowered:
                member['hubungan'] = value
                break

        return member

    def _post_process(self, result: Dict) -> Dict:
        """Validate the KK number and upper-case the textual fields in place."""
        # A KK number must be exactly 16 digits; try to salvage it by
        # dropping non-digits, otherwise discard it.
        if result['no_kk'] and not re.match(r'^\d{16}$', result['no_kk']):
            cleaned = re.sub(r'\D', '', result['no_kk'])
            if len(cleaned) == 16:
                result['no_kk'] = cleaned
            else:
                result['no_kk'] = None

        for field in ['nama_kepala_keluarga', 'alamat', 'kel_desa', 'kecamatan',
                      'kabupaten_kota', 'provinsi']:
            if result[field]:
                result[field] = result[field].upper()

        return result
Template (normalized coordinates: x_min, y_min, x_max, y_max) + # Based on standard KTP layout + ZONES = { + 'header_provinsi': (0.15, 0.00, 0.85, 0.07), # PROVINSI header + 'header_kabupaten': (0.15, 0.05, 0.85, 0.13), # KABUPATEN header + 'nik': (0.02, 0.10, 0.70, 0.22), # NIK area + 'nama': (0.02, 0.18, 0.70, 0.28), # Nama area + 'ttl': (0.02, 0.25, 0.70, 0.36), # Tempat/Tgl Lahir + 'jenis_kelamin': (0.02, 0.33, 0.45, 0.42), # Jenis Kelamin (left) + 'gol_darah': (0.40, 0.33, 0.70, 0.42), # Gol Darah (right of jenis) + 'alamat': (0.02, 0.38, 0.70, 0.50), # Alamat + 'rt_rw': (0.02, 0.46, 0.70, 0.54), # RT/RW + 'kel_desa': (0.02, 0.51, 0.70, 0.60), # Kel/Desa + 'kecamatan': (0.02, 0.57, 0.70, 0.66), # Kecamatan + 'agama': (0.02, 0.63, 0.70, 0.72), # Agama + 'status': (0.02, 0.69, 0.70, 0.78), # Status Perkawinan + 'pekerjaan': (0.02, 0.75, 0.70, 0.84), # Pekerjaan + 'wni': (0.02, 0.81, 0.70, 0.90), # Kewarganegaraan + 'berlaku': (0.02, 0.87, 0.70, 0.96), # Berlaku Hingga + 'foto': (0.68, 0.10, 0.98, 0.55), # Foto (right side) + 'penerbitan': (0.65, 0.58, 0.98, 0.98), # Tempat & Tanggal penerbitan + } + + def __init__(self): + self.image_width = 0 + self.image_height = 0 + + def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]: + """Determine which zone a text belongs to based on normalized coordinates""" + if img_width == 0 or img_height == 0: + return None + + # Normalize coordinates + x_norm = x_center / img_width + y_norm = y_center / img_height + + for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items(): + if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max: + return zone_name + return None + + def _extract_value_from_text(self, text: str) -> str: + """Extract value part from label:value text""" + # Split by colon (standard or full-width) + parts = re.split(r'[::]', text, 1) + if len(parts) > 1: + return parts[1].strip() + return text.strip() + + def _detect_image_size(self, ocr_results: 
List[Dict]) -> tuple: + """Detect image dimensions from bounding boxes""" + max_x, max_y = 0, 0 + for r in ocr_results: + bbox = r.get('bbox', []) + if bbox and len(bbox) >= 4: + for point in bbox: + if len(point) >= 2: + max_x = max(max_x, point[0]) + max_y = max(max_y, point[1]) + # Add some margin + return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640) + + def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict): + """Extract fields based on zone assignments""" + + # PROVINSI from header + if 'header_provinsi' in zone_texts: + for text in zone_texts['header_provinsi']: + if 'provinsi' in text.lower(): + val = re.sub(r'(?i)provinsi\s*', '', text).strip() + if val: + result['provinsi'] = val.upper() + break + + # KABUPATEN/KOTA from header + if 'header_kabupaten' in zone_texts: + for text in zone_texts['header_kabupaten']: + text_lower = text.lower() + if 'kabupaten' in text_lower or 'kota' in text_lower: + val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text).strip() + if val: + result['kabupaten_kota'] = val.upper() + else: + result['kabupaten_kota'] = text.upper() + break + + # NAMA from nama zone (skip label line) + if 'nama' in zone_texts: + for text in zone_texts['nama']: + text_lower = text.lower() + if 'nama' not in text_lower and len(text) > 2: + result['nama'] = text.upper() + break + elif 'nama' in text_lower: + val = self._extract_value_from_text(text) + if val and 'nama' not in val.lower(): + result['nama'] = val.upper() + + # TTL from ttl zone + if 'ttl' in zone_texts: + for text in zone_texts['ttl']: + if 'tempat' in text.lower() or 'lahir' in text.lower(): + val = self._extract_value_from_text(text) + if val: + self._parse_ttl(val, result) + break + + # JENIS KELAMIN + if 'jenis_kelamin' in zone_texts: + for text in zone_texts['jenis_kelamin']: + text_lower = text.lower() + if 'laki' in text_lower: + result['jenis_kelamin'] = 'LAKI-LAKI' + break + elif 'perempuan' in text_lower: + result['jenis_kelamin'] = 
'PEREMPUAN' + break + + # GOL DARAH + if 'gol_darah' in zone_texts: + for text in zone_texts['gol_darah']: + gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE) + if gol_match: + result['gol_darah'] = gol_match.group(1).upper() + break + + # ALAMAT + if 'alamat' in zone_texts: + for text in zone_texts['alamat']: + if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1: + val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text + if val and 'alamat' not in val.lower(): + result['alamat'] = val.upper() + break + + # PENERBITAN area (tempat & tanggal dalam satu zona) + if 'penerbitan' in zone_texts: + for text in zone_texts['penerbitan']: + # Look for date + date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text) + if date_match and result['tanggal_penerbitan'] is None: + result['tanggal_penerbitan'] = date_match.group(1) + + def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]: + """ + Ekstrak field KTP dari hasil OCR dengan template-based zone detection + + Args: + ocr_results: List hasil dari OCREngine.extract_text() + + Returns: + Dict dengan field KTP + """ + result = { + 'nik': None, + 'nama': None, + 'tempat_lahir': None, + 'tanggal_lahir': None, + 'jenis_kelamin': None, + 'gol_darah': None, + 'alamat': None, + 'rt_rw': None, + 'kel_desa': None, + 'kecamatan': None, + 'agama': None, + 'status_perkawinan': None, + 'pekerjaan': None, + 'kewarganegaraan': None, + 'berlaku_hingga': None, + 'provinsi': None, + 'kabupaten_kota': None, + 'tanggal_penerbitan': None, + } + + # Detect image dimensions from bounding boxes + img_width, img_height = self._detect_image_size(ocr_results) + + # Assign zones to each OCR result + zone_texts = {} # zone_name -> list of texts + for r in ocr_results: + x_center = r.get('x_center', 0) + y_center = r.get('y_center', 0) + zone = self._get_zone(x_center, y_center, img_width, img_height) + if zone: + if zone not in zone_texts: + zone_texts[zone] = [] + 
zone_texts[zone].append(r['text']) + + # Debug: print zone assignments + print("\n[DEBUG KTPExtractor] Zone assignments:") + for zone, texts in zone_texts.items(): + print(f" {zone}: {texts}") + + # Extract fields using zone-based approach + self._extract_by_zones(zone_texts, result) + + # Gabungkan semua teks untuk fallback pattern matching + texts = [r['text'].strip() for r in ocr_results] + all_text = '\n'.join(texts) + + # Ekstrak NIK (16 digit) - bisa ada di mana saja + nik_match = re.search(r'\b(\d{16})\b', all_text) + if nik_match: + result['nik'] = nik_match.group(1) + print(f" -> NIK found: {result['nik']}") + + # Fallback: Parse line by line for fields not found by zone + for i, text in enumerate(texts): + text_lower = text.lower() + + # Normalize colons + text_normalized = re.sub(self.COLON_PATTERN, ':', text) + text_norm_lower = text_normalized.lower() + + # ===== PROVINSI ===== + if 'provinsi' in text_lower and result['provinsi'] is None: + val = self._extract_after_label(text_normalized, 'provinsi') + if val: + result['provinsi'] = val.upper() + elif i + 1 < len(texts) and 'provinsi' not in texts[i+1].lower(): + # Mungkin value di line berikutnya + result['provinsi'] = texts[i+1].strip().upper() + + # ===== KABUPATEN/KOTA ===== + if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None: + if 'provinsi' not in text_lower: # Bukan bagian dari provinsi + val = self._extract_after_label(text_normalized, 'kabupaten|kota') + if val: + result['kabupaten_kota'] = val.upper() + else: + result['kabupaten_kota'] = text.strip().upper() + + # ===== NAMA ===== + if 'nama' in text_lower and result['nama'] is None: + val = self._extract_after_label(text_normalized, 'nama') + if val and len(val) > 2: + result['nama'] = val.upper() + elif i + 1 < len(texts): + # Nama di line berikutnya + next_text = texts[i+1].strip() + if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['tempat', 'lahir', 
'jenis']): + result['nama'] = next_text.upper() + + # ===== TEMPAT/TANGGAL LAHIR ===== + # Match "Tempat/Tgl Lahir" or "Tempat Lahir" or similar labels + if 'tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower): + if result['tempat_lahir'] is None or result['tanggal_lahir'] is None: + # Extract value after label using full-width or standard colon + ttl = self._extract_after_label(text_normalized, r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir') + if ttl: + self._parse_ttl(ttl, result) + elif ':' in text or ':' in text: + # Value is after colon but _extract_after_label didn't catch it + parts = re.split(r'[::]', text, 1) + if len(parts) > 1 and parts[1].strip(): + self._parse_ttl(parts[1].strip(), result) + elif i + 1 < len(texts): + # TTL di line berikutnya + next_text = texts[i+1].strip() + if not any(kw in next_text.lower() for kw in ['jenis', 'kelamin', 'alamat', 'gol']): + self._parse_ttl(next_text, result) + + # ===== JENIS KELAMIN ===== + if any(kw in text_lower for kw in self.MALE_KEYWORDS): + if result['jenis_kelamin'] is None: + result['jenis_kelamin'] = 'LAKI-LAKI' + elif any(kw in text_lower for kw in self.FEMALE_KEYWORDS): + if result['jenis_kelamin'] is None: + result['jenis_kelamin'] = 'PEREMPUAN' + + # ===== GOLONGAN DARAH ===== + if 'darah' in text_lower or 'gol.' 
in text_lower: + # Try to find blood type on same line + gol_match = re.search(r'(?:gol|darah)[.\s::]*([ABO]{1,2}[+\-]?)', text, re.IGNORECASE) + if gol_match and result['gol_darah'] is None: + result['gol_darah'] = gol_match.group(1).upper() + elif result['gol_darah'] is None and i + 1 < len(texts): + # Blood type might be on next line (real KTP pattern) + next_text = texts[i+1].strip() + if re.match(r'^[ABO]{1,2}[+\-]?$', next_text, re.IGNORECASE): + result['gol_darah'] = next_text.upper() + # Standalone blood type (e.g., just "O" or "A+" on its own line) + if result['gol_darah'] is None: + if re.match(r'^[ABO]{1,2}[+\-]?$', text.strip(), re.IGNORECASE) and len(text.strip()) <= 3: + result['gol_darah'] = text.strip().upper() + + # ===== ALAMAT ===== + if 'alamat' in text_lower and result['alamat'] is None: + val = self._extract_after_label(text_normalized, 'alamat') + if val: + result['alamat'] = val.upper() + elif i + 1 < len(texts): + result['alamat'] = texts[i+1].strip().upper() + + # ===== RT/RW ===== + rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text) + if rt_rw_match: + result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}" + + # ===== KELURAHAN/DESA ===== + if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower: + if result['kel_desa'] is None: + val = self._extract_after_label(text_normalized, 'kel|desa') + if val: + result['kel_desa'] = val.upper() + elif i + 1 < len(texts): + result['kel_desa'] = texts[i+1].strip().upper() + + # ===== KECAMATAN ===== + if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower): + if result['kecamatan'] is None: + val = self._extract_after_label(text_normalized, 'kecamatan|kec') + if val: + result['kecamatan'] = val.upper() + elif i + 1 < len(texts): + # Value on next line (real KTP pattern) + next_text = texts[i+1].strip() + if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']): + result['kecamatan'] = 
next_text.upper() + + # ===== AGAMA ===== + if 'agama' in text_lower: + val = self._extract_after_label(text_normalized, 'agama') + if val and result['agama'] is None: + result['agama'] = val.upper() + elif result['agama'] is None and i + 1 < len(texts): + # Value on next line (real KTP pattern) + next_text = texts[i+1].strip().upper() + if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']: + result['agama'] = next_text + else: + # Check if line contains only agama name + for agama in self.AGAMA_LIST: + if agama in text_lower and len(text) < 20: + if result['agama'] is None: + result['agama'] = text.strip().upper() + break + + # ===== STATUS PERKAWINAN ===== + if 'kawin' in text_lower: + if result['status_perkawinan'] is None: + val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan') + if val: + result['status_perkawinan'] = val.upper() + elif 'belum' in text_lower: + result['status_perkawinan'] = 'BELUM KAWIN' + elif 'kawin' in text_lower and 'cerai' not in text_lower: + result['status_perkawinan'] = 'KAWIN' + elif 'cerai hidup' in text_lower: + result['status_perkawinan'] = 'CERAI HIDUP' + elif 'cerai mati' in text_lower: + result['status_perkawinan'] = 'CERAI MATI' + + # ===== PEKERJAAN ===== + if 'pekerjaan' in text_lower: + val = self._extract_after_label(text_normalized, 'pekerjaan') + if val and result['pekerjaan'] is None: + result['pekerjaan'] = val.upper() + elif result['pekerjaan'] is None and i + 1 < len(texts): + # Value on next line (real KTP pattern) + next_text = texts[i+1].strip() + if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower(): + result['pekerjaan'] = next_text.upper() + else: + # Check if line contains pekerjaan keyword + for pekerjaan in self.PEKERJAAN_LIST: + if pekerjaan in text_lower and len(text) < 30: + if result['pekerjaan'] is None: + result['pekerjaan'] = text.strip().upper() + break + + # ===== KEWARGANEGARAAN ===== + if 'wni' in text_lower: + 
result['kewarganegaraan'] = 'WNI' + elif 'wna' in text_lower: + result['kewarganegaraan'] = 'WNA' + elif 'warga' in text_lower and result['kewarganegaraan'] is None: + val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga') + if val: + result['kewarganegaraan'] = val.upper() + + # ===== BERLAKU HINGGA ===== + if 'berlaku' in text_lower or 'seumur' in text_lower: + if result['berlaku_hingga'] is None: + if 'seumur' in text_lower or 'hidup' in text_lower: + result['berlaku_hingga'] = 'SEUMUR HIDUP' + else: + val = self._extract_after_label(text_normalized, 'berlaku') + if val: + result['berlaku_hingga'] = val.upper() + + # ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) ===== + # Look for date that is NOT tanggal lahir (different date) + if result['tanggal_penerbitan'] is None: + # Match date format at end of text or standalone date + date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text.strip()) + if date_match: + found_date = date_match.group(1) + # Make sure it's not the same as tanggal_lahir + if result['tanggal_lahir'] != found_date: + # Likely penerbitan if after berlaku_hingga was found + if result['berlaku_hingga'] or i > len(texts) * 0.7: + result['tanggal_penerbitan'] = found_date + + # Post-processing + result = self._post_process(result) + + return result + + def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]: + """Ekstrak nilai setelah label (supports various separators)""" + patterns = [ + rf'(?:{label_pattern})\s*:\s*(.+)', # label: value + rf'(?:{label_pattern})\s+([A-Z0-9].+)', # label VALUE (uppercase start) + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + value = match.group(1).strip() + # Remove trailing colon or label fragment + value = re.sub(r'^[:\s]+', '', value) + value = re.sub(r'\s*:\s*$', '', value) + if value and len(value) > 1: + return value + + return None + + def _parse_ttl(self, ttl_text: str, result: Dict): + """Parse 
tempat/tanggal lahir dari text""" + ttl_text = ttl_text.strip() + + # Normalize dates where OCR missed dashes: + # "05 08 1978" -> "05-08-1978" + # "05 08-1978" -> "05-08-1978" + # "05-08 1978" -> "05-08-1978" + ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text) + ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text) + ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text) + + # Handle 8-digit date without separator: "05081978" -> "05-08-1978" + date_8digit = re.search(r'(\d{8})', ttl_text) + if date_8digit: + d = date_8digit.group(1) + formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}" + ttl_text = ttl_text.replace(d, formatted) + + # Handle merged city+date like "JAKARTA05-08-1978" - add space before digits + ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE) + + # Format: "TEMPAT, DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY" + date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text) + if date_match: + result['tanggal_lahir'] = date_match.group(1) + # Tempat adalah bagian sebelum tanggal + place = ttl_text[:date_match.start()].strip(' ,:-/') + # Clean up label remnants + place = re.sub(r'^(tempat|tgl|lahir|:|:)[/\s::]*', '', place, flags=re.IGNORECASE).strip() + if place and len(place) > 2: + result['tempat_lahir'] = place.upper() + else: + # Coba split by comma + parts = ttl_text.split(',') + if len(parts) >= 2: + result['tempat_lahir'] = parts[0].strip().upper() + result['tanggal_lahir'] = parts[1].strip() + elif len(parts) == 1 and len(ttl_text) > 2: + result['tempat_lahir'] = ttl_text.upper() + + def _post_process(self, result: Dict) -> Dict: + """Post-processing hasil ekstraksi""" + # Validasi NIK (harus 16 digit) + if result['nik'] and not re.match(r'^\d{16}$', result['nik']): + cleaned = re.sub(r'\D', '', result['nik']) + if len(cleaned) == 16: + result['nik'] = cleaned + else: + result['nik'] = None + + # Clean all string values - remove leading colons and extra 
whitespace + for field in result: + if result[field] and isinstance(result[field], str): + val = result[field] + # Remove leading colons (standard and full-width) + val = re.sub(r'^[\s::]+', '', val) + # Remove trailing colons + val = re.sub(r'[\s::]+$', '', val) + # Remove double spaces + val = re.sub(r'\s+', ' ', val) + result[field] = val.strip() + + # Bersihkan label dari values + for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']: + if result[field]: + # Remove common labels yang ter-capture + result[field] = re.sub( + r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s::]*', + '', result[field], flags=re.IGNORECASE + ).strip() + + # Fix status perkawinan yang masih mengandung label + if result['status_perkawinan']: + sp = result['status_perkawinan'] + sp = re.sub(r'^(STATUS|PERKAWINAN)[\s::]*', '', sp, flags=re.IGNORECASE).strip() + result['status_perkawinan'] = sp + + # Fix berlaku hingga + if result['berlaku_hingga']: + bh = result['berlaku_hingga'] + bh = re.sub(r'^(BERLAKU|HINGGA)[\s::]*', '', bh, flags=re.IGNORECASE).strip() + if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper(): + result['berlaku_hingga'] = 'SEUMUR HIDUP' + else: + result['berlaku_hingga'] = bh + + # Fix merged kabupaten/kota names (e.g., JAKARTASELATAN -> JAKARTA SELATAN) + if result['kabupaten_kota']: + kk = result['kabupaten_kota'] + # Add space before directional words + kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)', + r'\1 \2', kk, flags=re.IGNORECASE) + # Common merged patterns + kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE) + result['kabupaten_kota'] = kk.upper() + + # Fix merged provinsi names + if result['provinsi']: + prov = result['provinsi'] + prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE) + prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA 
TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)', + r'\1 \2', prov, flags=re.IGNORECASE) + result['provinsi'] = prov.upper() + + # Fix merged alamat/address (e.g., JLKECAPIV -> JL KECAPI V) + if result['alamat']: + alamat = result['alamat'] + # Add space after common street prefixes + alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE) + # Add space before Roman numerals at the end (I, II, III, IV, V, VI, VII, VIII, IX, X) + alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE) + # Add space before single digits/numbers at end + alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE) + # Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A" + alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE) + result['alamat'] = alamat.upper() + + return result + + +if __name__ == "__main__": + # Test + sample_ocr = [ + {'text': 'PROVINSI JAWA BARAT'}, + {'text': 'KABUPATEN BANDUNG'}, + {'text': 'NIK : 3204012345678901'}, + {'text': 'Nama : JOHN DOE'}, + {'text': 'Tempat/Tgl Lahir : BANDUNG, 01-01-1990'}, + {'text': 'Jenis Kelamin : LAKI-LAKI'}, + {'text': 'Alamat : JL. MERDEKA NO. 
class OCREngine:
    """Wrapper around a PaddleOCR 3.x pipeline for Indonesian documents (KTP, KK)."""

    def __init__(self):
        """Create the PaddleOCR pipeline with orientation and unwarping enabled."""
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=True,  # detect page rotation (0/90/180/270 degrees)
            use_doc_unwarping=True,             # perspective correction (trapezoid -> rectangle)
            use_textline_orientation=True,      # orientation per text line
        )

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Load an image and return a contrast-enhanced BGR copy.

        The image is shrunk so its longest side is at most 2000 px, converted
        to grayscale, contrast-enhanced with CLAHE and converted back to a
        three-channel BGR array.

        Args:
            image_path: Path of the image file.

        Returns:
            The enhanced image as a BGR array.

        Raises:
            ValueError: If ``cv2.imread`` cannot read the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Tidak dapat membaca gambar: {image_path}")

        # Downscale oversized images (longest side capped at 2000 px).
        max_dim = 2000
        height, width = img.shape[:2]
        longest = max(height, width)
        if longest > max_dim:
            scale = max_dim / longest
            img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

        # CLAHE works on a single channel, so go through grayscale.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Back to three channels for the OCR pipeline.
        return cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)

    @staticmethod
    def _field(res, *names):
        """Return the first non-None attribute or mapping entry of ``res`` among ``names``.

        Uses ``is None`` checks instead of truthiness so array-valued fields
        do not raise "truth value of an array is ambiguous". Returns an empty
        list when none of the names is present.
        """
        for name in names:
            value = getattr(res, name, None)
            if value is None and hasattr(res, 'get'):
                value = res.get(name)
            if value is not None:
                return value
        return []

    @staticmethod
    def _as_points(poly) -> list:
        """Convert one polygon (array or sequence of points) to a plain list."""
        if hasattr(poly, 'tolist'):
            poly = poly.tolist()
        return list(poly) if isinstance(poly, (list, tuple)) else []

    def extract_text(self, image_path: str, preprocess: bool = False) -> list:
        """Run OCR on an image and return the recognised lines in reading order.

        Args:
            image_path: Path of the image file.
            preprocess: When true, the image is first passed through
                ``preprocess_image`` and the resulting array is given to the
                pipeline instead of the path.

        Returns:
            A list of dicts with keys ``text``, ``confidence``, ``bbox``,
            ``y_center`` and ``x_center``, sorted top-to-bottom then
            left-to-right by box centre. Lines without a usable box get a
            synthetic ``y_center`` of ``index * 10``. Any failure is printed
            with its traceback and an empty list is returned.
        """
        try:
            source = self.preprocess_image(image_path) if preprocess else image_path
            result = self.ocr.predict(input=source)

            if not result:
                return []

            extracted = []
            for res in result:
                # Result items are read by attribute or by key, whichever exists.
                texts = self._field(res, 'rec_texts', 'texts')
                scores = self._field(res, 'rec_scores', 'scores')
                # NOTE(review): 'rec_polys' is preferred because it is assumed to
                # be index-aligned with 'rec_texts' - confirm against the
                # installed PaddleOCR version; 'dt_polys' is the fallback.
                polys = self._field(res, 'rec_polys', 'dt_polys')

                for i, text in enumerate(texts):
                    confidence = scores[i] if i < len(scores) else 0.0
                    bbox = self._as_points(polys[i]) if i < len(polys) else []

                    if len(bbox) >= 4:
                        # Centre taken from two opposite corners (points 0 and 2).
                        y_center = (bbox[0][1] + bbox[2][1]) / 2
                        x_center = (bbox[0][0] + bbox[2][0]) / 2
                    else:
                        # No geometry: keep recognition order.
                        y_center = i * 10
                        x_center = 0

                    extracted.append({
                        'text': text,
                        'confidence': float(confidence),
                        'bbox': bbox,
                        'y_center': y_center,
                        'x_center': x_center,
                    })

            # Top-to-bottom, then left-to-right.
            extracted.sort(key=lambda item: (item['y_center'], item['x_center']))
            return extracted

        except Exception as e:
            # Best-effort API: report the failure and hand back "no text".
            print(f"Error OCR: {e}")
            import traceback
            traceback.print_exc()
            return []

    def get_raw_text(self, image_path: str) -> str:
        """Return all recognised text of an image as newline-separated lines."""
        results = self.extract_text(image_path)
        return '\n'.join(r['text'] for r in results)
# Module-wide OCREngine, created lazily; stays None until first requested.
_ocr_engine = None


def get_ocr_engine() -> OCREngine:
    """Return the shared OCREngine, constructing it on the first call."""
    global _ocr_engine
    if _ocr_engine is not None:
        return _ocr_engine
    _ocr_engine = OCREngine()
    return _ocr_engine
+ margin-bottom: 2rem; +} + +header h1 { + font-size: 2.5rem; + font-weight: 700; + background: var(--accent-gradient); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + margin-bottom: 0.5rem; +} + +.subtitle { + color: var(--text-secondary); + font-size: 1.1rem; +} + +/* Upload Section */ +.upload-section { + background: var(--bg-secondary); + border-radius: var(--radius-lg); + padding: 2rem; + box-shadow: var(--shadow-lg); + margin-bottom: 2rem; +} + +/* Document Type Selector */ +.doc-type-selector { + display: flex; + gap: 1rem; + margin-bottom: 1.5rem; +} + +.doc-btn { + flex: 1; + display: flex; + align-items: center; + justify-content: center; + gap: 0.5rem; + padding: 1rem; + background: var(--bg-tertiary); + border: 2px solid transparent; + border-radius: var(--radius); + color: var(--text-secondary); + font-size: 1rem; + font-weight: 600; + cursor: pointer; + transition: all 0.3s ease; +} + +.doc-btn:hover { + background: var(--bg-primary); + color: var(--text-primary); +} + +.doc-btn.active { + background: var(--accent-gradient); + color: white; + border-color: var(--accent-secondary); +} + +.doc-btn .icon { + font-size: 1.5rem; +} + +/* Dropzone */ +.dropzone { + border: 2px dashed var(--border); + border-radius: var(--radius); + padding: 3rem 2rem; + text-align: center; + cursor: pointer; + transition: all 0.3s ease; + background: var(--bg-tertiary); + position: relative; + overflow: hidden; +} + +.dropzone:hover, +.dropzone.dragover { + border-color: var(--accent-primary); + background: rgba(99, 102, 241, 0.1); +} + +.dropzone-content { + display: flex; + flex-direction: column; + align-items: center; + gap: 0.5rem; +} + +.upload-icon { + font-size: 4rem; + margin-bottom: 0.5rem; +} + +.dropzone p { + color: var(--text-secondary); +} + +.dropzone .hint { + color: var(--text-muted); + font-size: 0.875rem; +} + +.file-btn { + display: inline-block; + padding: 0.75rem 1.5rem; + background: 
var(--accent-gradient); + color: white; + border-radius: var(--radius); + font-weight: 600; + cursor: pointer; + margin: 0.5rem 0; + transition: transform 0.2s ease; +} + +.file-btn:hover { + transform: scale(1.05); +} + +.file-types { + font-size: 0.75rem; + color: var(--text-muted); +} + +.preview-image { + max-width: 100%; + max-height: 400px; + border-radius: var(--radius); + cursor: pointer; +} + +/* Process Button */ +.process-btn { + width: 100%; + padding: 1rem; + margin-top: 1.5rem; + background: var(--accent-gradient); + border: none; + border-radius: var(--radius); + color: white; + font-size: 1.1rem; + font-weight: 600; + cursor: pointer; + transition: all 0.3s ease; + box-shadow: var(--shadow); +} + +.process-btn:hover:not(:disabled) { + transform: translateY(-2px); + box-shadow: var(--shadow-lg); +} + +.process-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +/* Results Section */ +.results-section { + background: var(--bg-secondary); + border-radius: var(--radius-lg); + padding: 2rem; + box-shadow: var(--shadow-lg); + animation: slideUp 0.3s ease; +} + +@keyframes slideUp { + from { + opacity: 0; + transform: translateY(20px); + } + + to { + opacity: 1; + transform: translateY(0); + } +} + +.results-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 1.5rem; + flex-wrap: wrap; + gap: 1rem; +} + +.results-header h2 { + font-size: 1.5rem; +} + +.results-actions { + display: flex; + gap: 0.5rem; +} + +.action-btn { + padding: 0.5rem 1rem; + background: var(--bg-tertiary); + border: 1px solid var(--border); + border-radius: var(--radius); + color: var(--text-primary); + font-size: 0.875rem; + cursor: pointer; + transition: all 0.2s ease; +} + +.action-btn:hover { + background: var(--accent-primary); + border-color: var(--accent-primary); +} + +.action-btn.secondary { + background: transparent; +} + +/* Results Table */ +.results-table { + width: 100%; + border-collapse: collapse; +} + 
+.results-table th, +.results-table td { + padding: 0.875rem 1rem; + text-align: left; + border-bottom: 1px solid var(--border); +} + +.results-table th { + background: var(--bg-tertiary); + color: var(--text-secondary); + font-weight: 600; + font-size: 0.875rem; + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.results-table th:first-child { + border-radius: var(--radius) 0 0 0; +} + +.results-table th:last-child { + border-radius: 0 var(--radius) 0 0; +} + +.field-label { + color: var(--text-secondary); + font-weight: 500; + width: 40%; +} + +.field-value { + color: var(--text-primary); + font-weight: 600; +} + +.results-table tr:hover { + background: rgba(99, 102, 241, 0.05); +} + +/* Editable Fields */ +.editable-field { + width: 100%; + padding: 0.5rem 0.75rem; + background: var(--bg-tertiary); + border: 1px solid var(--border); + border-radius: 6px; + color: var(--text-primary); + font-size: 0.95rem; + font-weight: 600; + font-family: inherit; + transition: all 0.2s ease; +} + +.editable-field:focus { + outline: none; + border-color: var(--accent-primary); + background: var(--bg-secondary); + box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2); +} + +.editable-field::placeholder { + color: var(--text-muted); + font-weight: 400; +} + +/* Region Dropdown Styles */ +.region-field-wrapper { + display: flex; + gap: 0.5rem; + align-items: center; +} + +.region-field-wrapper input, +.region-field-wrapper select { + flex: 1; +} + +.region-dropdown { + width: 100%; + padding: 0.5rem 0.75rem; + background: var(--bg-tertiary); + border: 1px solid var(--border); + border-radius: 6px; + color: var(--text-primary); + font-size: 0.95rem; + font-family: inherit; + cursor: pointer; +} + +.region-dropdown:focus { + outline: none; + border-color: var(--accent-primary); +} + +.dropdown-toggle { + padding: 0.5rem 0.75rem; + background: var(--bg-tertiary); + border: 1px solid var(--border); + border-radius: 6px; + color: var(--text-secondary); + cursor: pointer; + 
transition: all 0.2s ease; + flex-shrink: 0; +} + +.dropdown-toggle:hover { + background: var(--accent-primary); + color: white; +} + +.dropdown-toggle.confirmed { + background: var(--success); + color: white; + border-color: var(--success); +} + +/* Validation Indicators */ +.validation-status { + margin-left: 0.5rem; + font-size: 0.875rem; +} + +.validation-status.valid-field { + color: var(--success); +} + +.validation-status.invalid-field { + color: var(--warning); +} + +.editable-field.valid-field { + border-color: var(--success); +} + +.editable-field.invalid-field { + border-color: var(--warning); +} + +.suggestion-text { + font-size: 0.75rem; + color: var(--text-muted); + margin-top: 0.25rem; + font-style: italic; +} + +/* Raw Text Section */ +.raw-text-section { + margin-top: 1.5rem; + padding-top: 1.5rem; + border-top: 1px solid var(--border); +} + +.raw-text-section h3 { + font-size: 1rem; + color: var(--text-secondary); + margin-bottom: 1rem; +} + +.raw-text-section pre { + background: var(--bg-primary); + padding: 1rem; + border-radius: var(--radius); + font-family: 'Consolas', monospace; + font-size: 0.875rem; + color: var(--text-secondary); + white-space: pre-wrap; + word-wrap: break-word; + max-height: 300px; + overflow-y: auto; +} + +/* Error Section */ +.error-section { + margin-top: 1rem; +} + +.error-content { + background: rgba(239, 68, 68, 0.1); + border: 1px solid var(--error); + border-radius: var(--radius); + padding: 1rem; + display: flex; + align-items: center; + gap: 0.75rem; +} + +.error-icon { + font-size: 1.5rem; +} + +.error-content p { + color: var(--error); +} + +/* Footer */ +footer { + text-align: center; + margin-top: 2rem; + padding-top: 1rem; + border-top: 1px solid var(--border); +} + +footer p { + color: var(--text-muted); + font-size: 0.875rem; +} + +footer a { + color: var(--accent-secondary); + text-decoration: none; +} + +footer a:hover { + text-decoration: underline; +} + +/* Responsive */ +@media (max-width: 600px) { + 
.container { + padding: 1rem; + } + + header h1 { + font-size: 2rem; + } + + .upload-section, + .results-section { + padding: 1.5rem; + } + + .doc-type-selector { + flex-direction: column; + } + + .results-header { + flex-direction: column; + align-items: flex-start; + } + + .results-actions { + width: 100%; + justify-content: flex-start; + } + + .field-label { + width: 45%; + } +} + +/* Scrollbar */ +::-webkit-scrollbar { + width: 8px; + height: 8px; +} + +::-webkit-scrollbar-track { + background: var(--bg-tertiary); +} + +::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 4px; +} + +::-webkit-scrollbar-thumb:hover { + background: var(--text-muted); +} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..7f70e90 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,570 @@ + + + + + + + OCR KTP/KK - Pembaca Dokumen Indonesia + + + + +
+
+

📄 OCR KTP/KK

+

Pembaca Dokumen Indonesia Offline

+
+ +
+ +
+
+ + +
+ +
+
+
📷
+

Drag & drop gambar di sini

+

atau

+ +

PNG, JPG, JPEG, BMP, WEBP (max 16MB)

+
+ +
+ + +
+ + + + + + +
+ +
+

OCR menggunakan PaddleOCR • Data + diproses secara lokal

+
+
+ + + + + \ No newline at end of file