OCR dengan ZONA

This commit is contained in:
2025-12-28 01:20:37 +08:00
commit 4fe381b3f0
12 changed files with 2356 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

253
app.py Normal file
View File

@@ -0,0 +1,253 @@
"""
Flask Web Server untuk OCR KTP/KK
"""
import os
from flask import Flask, render_template, request, jsonify
from werkzeug.utils import secure_filename
from ocr_engine import get_ocr_engine
from ktp_extractor import KTPExtractor
from kk_extractor import KKExtractor
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
# Configuration
# Uploads are written to an "uploads" directory next to this file.
UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), 'uploads')
# File extensions accepted by allowed_file() (compared in lower case).
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'bmp', 'webp'}
MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16MB max
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH
# Create the upload folder if it does not exist yet
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# Initialise the extractors once at import time; the upload handler reuses them
ktp_extractor = KTPExtractor()
kk_extractor = KKExtractor()
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot and is compared in
    lower case; a name without a dot is rejected.
    """
    if '.' not in filename:
        return False
    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the main page (the ``index.html`` template)."""
    page = render_template('index.html')
    return page
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle an image upload and run OCR field extraction on it.

    Reads the ``file`` part and the optional ``doc_type`` form field
    (default ``'ktp'``; any other value selects the KK extractor).

    Returns:
        A JSON response.  On success: ``success``, ``doc_type``, ``data``
        (extracted fields), ``raw_text`` and ``ocr_count``.  On a client
        error (no file, empty name, unsupported extension, no text read)
        HTTP 400 with ``success: False`` and an ``error`` message; on any
        unexpected exception HTTP 500 with the exception text.
    """
    # Local import: keeps this handler self-contained with respect to the
    # file-level import block.
    import uuid

    try:
        # Check that a file part was sent
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'Tidak ada file yang diupload'}), 400
        file = request.files['file']
        doc_type = request.form.get('doc_type', 'ktp')
        if file.filename == '':
            return jsonify({'success': False, 'error': 'Nama file kosong'}), 400
        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Format file tidak didukung. Gunakan PNG, JPG, JPEG, BMP, atau WEBP'}), 400

        # secure_filename() may strip a non-ASCII name down to something with
        # no usable extension (or to an empty string); fall back to a neutral
        # name that keeps the already-validated extension.
        safe_name = secure_filename(file.filename)
        if not allowed_file(safe_name):
            extension = file.filename.rsplit('.', 1)[1].lower()
            safe_name = f"upload.{extension}"
        # A random prefix keeps concurrent uploads of the same file name from
        # overwriting (and then deleting) each other's temporary file.
        filename = f"{uuid.uuid4().hex}_{safe_name}"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)

        try:
            # Save inside the try so a partially written file is also removed
            file.save(filepath)

            # Run OCR
            ocr_engine = get_ocr_engine()
            ocr_results = ocr_engine.extract_text(filepath)
            if not ocr_results:
                return jsonify({
                    'success': False,
                    'error': 'Tidak dapat membaca teks dari gambar. Pastikan gambar jelas dan tidak blur.'
                }), 400

            # Extract fields according to the document type
            if doc_type == 'ktp':
                extracted = ktp_extractor.extract(ocr_results)
            else:
                extracted = kk_extractor.extract(ocr_results)

            # Raw text, returned to the client for debugging
            raw_text = '\n'.join([r['text'] for r in ocr_results])

            # DEBUG: print raw OCR results
            # NOTE(review): this writes the document's text (personal data) to
            # stdout; consider removing it or gating it behind a debug flag.
            print("\n" + "="*50)
            print("DEBUG: Raw OCR Results")
            print("="*50)
            for i, r in enumerate(ocr_results):
                print(f"[{i}] {r['text']}")
            print("="*50 + "\n")

            return jsonify({
                'success': True,
                'doc_type': doc_type,
                'data': extracted,
                'raw_text': raw_text,
                'ocr_count': len(ocr_results)
            })
        finally:
            # Delete the file after processing (protects personal data)
            if os.path.exists(filepath):
                os.remove(filepath)
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
# ============================================
# Region Data API (using wilayah.id)
# ============================================
import requests
from functools import lru_cache
WILAYAH_API_BASE = "https://wilayah.id/api"


@lru_cache(maxsize=100)
def _fetch_region_data_cached(endpoint):
    """Fetch and decode one endpoint, raising on any failure.

    Failures are raised rather than returned because ``lru_cache`` only
    stores return values: an exception leaves the cache untouched, so a
    failed lookup is retried on the next call.
    """
    response = requests.get(f"{WILAYAH_API_BASE}/{endpoint}", timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f"unexpected HTTP status {response.status_code} for {endpoint}")
    return response.json()


def fetch_region_data(endpoint):
    """Fetch region data with caching of successful responses only.

    Args:
        endpoint: Path below ``WILAYAH_API_BASE``, e.g. ``"provinces.json"``.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, times out,
        returns a non-200 status or cannot be decoded.  ``None`` results are
        not cached, so a transient outage does not stick.
    """
    try:
        return _fetch_region_data_cached(endpoint)
    except Exception as e:
        print(f"Error fetching region data: {e}")
        return None


# Keep the lru_cache helpers reachable under the public name for callers
# that relied on the function being decorated directly.
fetch_region_data.cache_clear = _fetch_region_data_cached.cache_clear
fetch_region_data.cache_info = _fetch_region_data_cached.cache_info
def normalize_name(name):
    """Return *name* upper-cased with surrounding whitespace, dots and spaces removed.

    A falsy *name* (``None`` or ``""``) yields the empty string.
    """
    if not name:
        return ""
    trimmed = name.upper().strip()
    return "".join(ch for ch in trimmed if ch not in ". ")
def find_best_match(search_name, items, key='name'):
    """Find the first item whose name matches *search_name* (fuzzy matching).

    Names are compared after ``normalize_name``.  An exact match anywhere in
    *items* wins; otherwise the first item whose normalised name contains the
    normalised search text, or is contained in it, is returned.

    Args:
        search_name: Name to look for (e.g. an OCR-read region name).
        items: Iterable of dicts to search.
        key: Dict key holding each item's name.

    Returns:
        The matching item, or ``None`` when nothing matches, when either
        argument is empty, or when *search_name* normalises to nothing.
    """
    if not search_name or not items:
        return None
    search_norm = normalize_name(search_name)
    if not search_norm:
        # e.g. "." or " ": an empty needle would "match" every item below
        return None

    # Normalise each item once and reuse it for both passes
    candidates = [(normalize_name(item.get(key, '')), item) for item in items]

    # Try exact match first
    for item_norm, item in candidates:
        if item_norm == search_norm:
            return item

    # Try contains match
    for item_norm, item in candidates:
        if not item_norm:
            # '' is a substring of everything; an unnamed item is not a match
            continue
        if search_norm in item_norm or item_norm in search_norm:
            return item
    return None
@app.route('/api/provinces')
def get_provinces():
    """Return all provinces, or an empty list with HTTP 500 when none could be fetched."""
    provinces = fetch_region_data("provinces.json")
    if not provinces:
        return jsonify({'data': []}), 500
    return jsonify(provinces)
@app.route('/api/regencies/<province_code>')
def get_regencies(province_code):
    """Return the cities/regencies of a province, or an empty list with HTTP 500."""
    endpoint = f"regencies/{province_code}.json"
    regencies = fetch_region_data(endpoint)
    if not regencies:
        return jsonify({'data': []}), 500
    return jsonify(regencies)
@app.route('/api/districts/<regency_code>')
def get_districts(regency_code):
    """Return the districts of a regency, or an empty list with HTTP 500."""
    endpoint = f"districts/{regency_code}.json"
    districts = fetch_region_data(endpoint)
    if not districts:
        return jsonify({'data': []}), 500
    return jsonify(districts)
@app.route('/api/villages/<district_code>')
def get_villages(district_code):
    """Return the villages of a district, or an empty list with HTTP 500."""
    endpoint = f"villages/{district_code}.json"
    villages = fetch_region_data(endpoint)
    if not villages:
        return jsonify({'data': []}), 500
    return jsonify(villages)
@app.route('/api/validate-region', methods=['POST'])
def validate_region():
    """Validate OCR region names against the region database.

    Expects a JSON object that may contain ``provinsi``, ``kabupaten_kota``,
    ``kecamatan`` and ``kel_desa``.  The levels are checked top-down: each
    level is looked up among the children of the level matched before it,
    and checking stops at the first level that cannot be fetched or matched.

    Returns:
        ``{'success': True, 'validation': {...}}`` where every level maps to
        ``{'valid', 'code', 'suggestion'}``; HTTP 400 when the body is not a
        JSON object; HTTP 500 with the exception text on unexpected errors.
    """
    try:
        # silent=True yields None for a missing/invalid JSON body instead of
        # raising, so a bad request can be answered with 400 rather than 500.
        ocr_data = request.get_json(silent=True)
        if not isinstance(ocr_data, dict):
            return jsonify({'success': False, 'error': 'Body request harus berupa objek JSON'}), 400

        # (field name, endpoint prefix used to list that level's children)
        levels = (
            ('provinsi', 'regencies'),
            ('kabupaten_kota', 'districts'),
            ('kecamatan', 'villages'),
            ('kel_desa', None),
        )
        result = {
            field: {'valid': False, 'code': None, 'suggestion': None}
            for field, _ in levels
        }

        endpoint = "provinces.json"
        for field, child_prefix in levels:
            region_data = fetch_region_data(endpoint)
            if not region_data or 'data' not in region_data:
                break
            match = find_best_match(ocr_data.get(field), region_data['data'])
            if not match:
                break
            result[field] = {'valid': True, 'code': match['code'], 'suggestion': match['name']}
            if child_prefix is None:
                break
            # The next level is searched only among this match's children
            endpoint = f"{child_prefix}/{match['code']}.json"

        return jsonify({'success': True, 'validation': result})
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/health')
def health():
    """Health check endpoint: always answers with a fixed ``ok`` status."""
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    print("="*50)
    print("OCR KTP/KK Application")
    print("="*50)
    print("Membuka: http://localhost:5000")
    print("Tekan Ctrl+C untuk berhenti")
    print("="*50)
    # The server listens on all interfaces, and the interactive debugger lets
    # anyone who can reach it execute code.  Debug mode is therefore opt-in
    # through the FLASK_DEBUG environment variable instead of always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)

BIN
kk.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

235
kk_extractor.py Normal file
View File

@@ -0,0 +1,235 @@
"""
KK (Kartu Keluarga) Field Extractor
Ekstraksi data terstruktur dari hasil OCR KK Indonesia
"""
import re
from typing import Dict, Optional, List
class KKExtractor:
    """Extract structured fields from the OCR output of a Kartu Keluarga (KK)."""

    def __init__(self):
        pass

    def extract(self, ocr_results: List[Dict]) -> Dict:
        """Extract KK fields from OCR results.

        Args:
            ocr_results: List of OCR entries; each entry must have a
                ``'text'`` key.

        Returns:
            Dict with the header fields (``None`` when not found) and
            ``'anggota_keluarga'``, a list with one dict per table row in
            which a 16-digit NIK was found.
        """
        all_text = '\n'.join([r['text'] for r in ocr_results])
        result = {
            'no_kk': None,
            'nama_kepala_keluarga': None,
            'alamat': None,
            'rt_rw': None,
            'kel_desa': None,
            'kecamatan': None,
            'kabupaten_kota': None,
            'provinsi': None,
            'kode_pos': None,
            'anggota_keluarga': [],
        }

        # KK number: first standalone 16-digit run anywhere in the text
        kk_match = re.search(r'\b(\d{16})\b', all_text)
        if kk_match:
            result['no_kk'] = kk_match.group(1)

        # Becomes True once the member-table header has been seen.  The flag
        # alone decides whether rows are parsed: the previous extra check on
        # the header's y position was falsy for y == 0 (also the default when
        # 'y_center' is absent) and silently disabled member extraction.
        in_table = False

        for i, ocr in enumerate(ocr_results):
            text = ocr['text'].strip()
            text_lower = text.lower()

            # Province
            if 'provinsi' in text_lower and result['provinsi'] is None:
                result['provinsi'] = self._extract_value(text, 'provinsi')

            # Regency / city
            if ('kabupaten' in text_lower or 'kota' in text_lower) and result['kabupaten_kota'] is None:
                val = self._extract_value(text, 'kabupaten') or self._extract_value(text, 'kota')
                if val:
                    result['kabupaten_kota'] = val
                else:
                    # No label/value split possible: keep the whole line
                    result['kabupaten_kota'] = text

            # District
            if 'kecamatan' in text_lower and result['kecamatan'] is None:
                result['kecamatan'] = self._extract_value(text, 'kecamatan')

            # Village
            if ('kelurahan' in text_lower or 'desa' in text_lower) and result['kel_desa'] is None:
                result['kel_desa'] = self._extract_value(text, 'kelurahan') or self._extract_value(text, 'desa')

            # Labelled KK number: 16 digits on this line or on the next one
            if 'no' in text_lower and ('kk' in text_lower or 'kartu' in text_lower):
                match = re.search(r'(\d{16})', text)
                if match:
                    result['no_kk'] = match.group(1)
                elif i + 1 < len(ocr_results):
                    next_text = ocr_results[i + 1]['text']
                    match = re.search(r'(\d{16})', next_text)
                    if match:
                        result['no_kk'] = match.group(1)

            # Name of the head of the family
            if 'kepala' in text_lower and 'keluarga' in text_lower:
                result['nama_kepala_keluarga'] = self._extract_value(text, 'keluarga')
                if not result['nama_kepala_keluarga'] and i + 1 < len(ocr_results):
                    # The name may be on the next line
                    next_text = ocr_results[i + 1]['text'].strip()
                    if not any(kw in next_text.lower() for kw in ['alamat', 'rt', 'rw', 'provinsi']):
                        result['nama_kepala_keluarga'] = next_text

            # Address
            if 'alamat' in text_lower and result['alamat'] is None:
                result['alamat'] = self._extract_value(text, 'alamat')

            # RT/RW
            rt_rw_match = re.search(r'rt\s*/?\s*rw\s*[:\s]*(\d+)\s*/\s*(\d+)', text_lower)
            if rt_rw_match:
                result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"

            # Postal code
            if 'kode' in text_lower and 'pos' in text_lower:
                match = re.search(r'(\d{5})', text)
                if match:
                    result['kode_pos'] = match.group(1)

            # Header row of the family-member table
            if self._is_table_header(text_lower):
                in_table = True
                continue

            # Family members from the table rows
            if in_table:
                member = self._extract_member(text, ocr_results, i)
                if member:
                    result['anggota_keluarga'].append(member)

        # Post-processing
        result = self._post_process(result)
        return result

    def _extract_value(self, text: str, field: str) -> Optional[str]:
        """Return the value that follows the *field* label in *text*, or None.

        Tries ``<field...> : value`` first, then ``<field...> value``; the
        label may carry trailing letters (``field[a-z]*``).
        """
        patterns = [
            rf'{field}[a-z]*\s*:\s*(.+)',
            rf'{field}[a-z]*\s+(.+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                value = re.sub(r'^[:\s]+', '', value)
                if value:
                    return value
        return None

    def _is_table_header(self, text: str) -> bool:
        """Return True when *text* contains at least two member-table header keywords.

        *text* is expected to be lower-cased by the caller.
        """
        header_keywords = ['no', 'nama lengkap', 'nik', 'jenis kelamin', 'hubungan']
        count = sum(1 for kw in header_keywords if kw in text)
        return count >= 2

    def _extract_member(self, text: str, all_results: List[Dict], current_idx: int) -> Optional[Dict]:
        """Build a family-member record from one table row.

        Args:
            text: Text of the row.
            all_results: Full OCR result list (currently unused).
            current_idx: Index of the row in *all_results* (currently unused).

        Returns:
            A member dict when the row contains a standalone 16-digit NIK,
            otherwise ``None``.  Only ``nik``, ``jenis_kelamin`` and
            ``hubungan`` are filled here; the other keys stay ``None``.
        """
        # A row without a NIK is not treated as a member
        nik_match = re.search(r'\b(\d{16})\b', text)
        if not nik_match:
            return None
        member = {
            'nik': nik_match.group(1),
            'nama': None,
            'jenis_kelamin': None,
            'tempat_lahir': None,
            'tanggal_lahir': None,
            'hubungan': None,
        }
        text_lower = text.lower()
        # Padding with spaces lets a lone "l"/"p" at either end count as a word
        padded = f' {text_lower} '

        # Gender
        if 'laki' in text_lower or ' l ' in padded:
            member['jenis_kelamin'] = 'LAKI-LAKI'
        elif 'perempuan' in text_lower or ' p ' in padded:
            member['jenis_kelamin'] = 'PEREMPUAN'

        # Family relationship: the first keyword found (in this order) wins
        hubungan_keywords = {
            'kepala': 'KEPALA KELUARGA',
            'istri': 'ISTRI',
            'suami': 'SUAMI',
            'anak': 'ANAK',
            'menantu': 'MENANTU',
            'cucu': 'CUCU',
            'orang tua': 'ORANG TUA',
            'mertua': 'MERTUA',
        }
        for keyword, value in hubungan_keywords.items():
            if keyword in text_lower:
                member['hubungan'] = value
                break
        return member

    def _post_process(self, result: Dict) -> Dict:
        """Validate the KK number and upper-case the text fields in place."""
        # KK number must be exactly 16 digits once non-digits are removed
        if result['no_kk'] and not re.match(r'^\d{16}$', result['no_kk']):
            cleaned = re.sub(r'\D', '', result['no_kk'])
            if len(cleaned) == 16:
                result['no_kk'] = cleaned
            else:
                result['no_kk'] = None

        # Upper-case the text fields
        for field in ['nama_kepala_keluarga', 'alamat', 'kel_desa', 'kecamatan',
                      'kabupaten_kota', 'provinsi']:
            if result[field]:
                result[field] = result[field].upper()
        return result
if __name__ == "__main__":
    # Smoke test: run the extractor on a hand-written OCR sample
    demo_rows = [
        {'text': 'KARTU KELUARGA', 'y_center': 10},
        {'text': 'No. 3204012345678901', 'y_center': 30},
        {'text': 'Nama Kepala Keluarga : JOHN DOE', 'y_center': 50},
        {'text': 'Alamat : JL. MERDEKA NO. 123', 'y_center': 70},
        {'text': 'RT/RW : 001/002', 'y_center': 90},
        {'text': 'Desa/Kelurahan : SUKAMAJU', 'y_center': 110},
        {'text': 'Kecamatan : SUKASARI', 'y_center': 130},
        {'text': 'Kabupaten/Kota : BANDUNG', 'y_center': 150},
        {'text': 'Provinsi : JAWA BARAT', 'y_center': 170},
        {'text': 'Kode Pos : 40154', 'y_center': 190},
    ]
    parsed = KKExtractor().extract(demo_rows)
    # Print every header field; the member list is summarised separately
    for name, content in parsed.items():
        if name == 'anggota_keluarga':
            continue
        print(f"{name}: {content}")
    print(f"\nAnggota Keluarga: {len(parsed['anggota_keluarga'])} orang")

BIN
ktp.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

602
ktp_extractor.py Normal file
View File

@@ -0,0 +1,602 @@
"""
KTP Field Extractor
Ekstraksi data terstruktur dari hasil OCR KTP Indonesia
Mendukung berbagai format output OCR (full-width colon, standard colon, tanpa colon)
"""
import re
from typing import Dict, Optional, List
class KTPExtractor:
    """Extract structured fields from KTP OCR results."""

    # Colon variants emitted by OCR: ASCII ':' and full-width U+FF1A.
    # Written as an escape so the pattern survives tools that drop non-ASCII
    # text; the previous literal had lost that character and was left as an
    # unterminated character set that `re` refuses to compile.
    COLON_PATTERN = r'[:\uff1a]'

    # Keywords for gender
    MALE_KEYWORDS = ['laki', 'pria', 'male']
    FEMALE_KEYWORDS = ['perempuan', 'wanita', 'female']

    # Valid religions
    AGAMA_LIST = ['islam', 'kristen', 'katolik', 'hindu', 'budha', 'buddha', 'konghucu']

    # Common occupations
    PEKERJAAN_LIST = ['pelajar', 'mahasiswa', 'pegawai', 'swasta', 'pns', 'wiraswasta',
                      'buruh', 'petani', 'nelayan', 'karyawan', 'ibu rumah tangga',
                      'tidak bekerja', 'lainnya', 'mengurus rumah tangga']

    # KTP zone template (normalized coordinates: x_min, y_min, x_max, y_max),
    # based on the standard KTP layout.  Neighbouring zones overlap; lookups
    # that iterate this dict see the entries in the order written here.
    ZONES = {
        'header_provinsi': (0.15, 0.00, 0.85, 0.07),   # PROVINSI header
        'header_kabupaten': (0.15, 0.05, 0.85, 0.13),  # KABUPATEN header
        'nik': (0.02, 0.10, 0.70, 0.22),               # NIK area
        'nama': (0.02, 0.18, 0.70, 0.28),              # Name area
        'ttl': (0.02, 0.25, 0.70, 0.36),               # Place/date of birth
        'jenis_kelamin': (0.02, 0.33, 0.45, 0.42),     # Gender (left)
        'gol_darah': (0.40, 0.33, 0.70, 0.42),         # Blood type (right of gender)
        'alamat': (0.02, 0.38, 0.70, 0.50),            # Address
        'rt_rw': (0.02, 0.46, 0.70, 0.54),             # RT/RW
        'kel_desa': (0.02, 0.51, 0.70, 0.60),          # Village
        'kecamatan': (0.02, 0.57, 0.70, 0.66),         # District
        'agama': (0.02, 0.63, 0.70, 0.72),             # Religion
        'status': (0.02, 0.69, 0.70, 0.78),            # Marital status
        'pekerjaan': (0.02, 0.75, 0.70, 0.84),         # Occupation
        'wni': (0.02, 0.81, 0.70, 0.90),               # Citizenship
        'berlaku': (0.02, 0.87, 0.70, 0.96),           # Valid until
        'foto': (0.68, 0.10, 0.98, 0.55),              # Photo (right side)
        'penerbitan': (0.65, 0.58, 0.98, 0.98),        # Place & date of issue
    }
def __init__(self):
self.image_width = 0
self.image_height = 0
def _get_zone(self, x_center: float, y_center: float, img_width: int, img_height: int) -> Optional[str]:
"""Determine which zone a text belongs to based on normalized coordinates"""
if img_width == 0 or img_height == 0:
return None
# Normalize coordinates
x_norm = x_center / img_width
y_norm = y_center / img_height
for zone_name, (x_min, y_min, x_max, y_max) in self.ZONES.items():
if x_min <= x_norm <= x_max and y_min <= y_norm <= y_max:
return zone_name
return None
def _extract_value_from_text(self, text: str) -> str:
"""Extract value part from label:value text"""
# Split by colon (standard or full-width)
parts = re.split(r'[:]', text, 1)
if len(parts) > 1:
return parts[1].strip()
return text.strip()
def _detect_image_size(self, ocr_results: List[Dict]) -> tuple:
"""Detect image dimensions from bounding boxes"""
max_x, max_y = 0, 0
for r in ocr_results:
bbox = r.get('bbox', [])
if bbox and len(bbox) >= 4:
for point in bbox:
if len(point) >= 2:
max_x = max(max_x, point[0])
max_y = max(max_y, point[1])
# Add some margin
return (int(max_x * 1.05), int(max_y * 1.05)) if max_x > 0 else (1000, 640)
def _extract_by_zones(self, zone_texts: Dict[str, List[str]], result: Dict):
"""Extract fields based on zone assignments"""
# PROVINSI from header
if 'header_provinsi' in zone_texts:
for text in zone_texts['header_provinsi']:
if 'provinsi' in text.lower():
val = re.sub(r'(?i)provinsi\s*', '', text).strip()
if val:
result['provinsi'] = val.upper()
break
# KABUPATEN/KOTA from header
if 'header_kabupaten' in zone_texts:
for text in zone_texts['header_kabupaten']:
text_lower = text.lower()
if 'kabupaten' in text_lower or 'kota' in text_lower:
val = re.sub(r'(?i)(kabupaten|kota)\s*', '', text).strip()
if val:
result['kabupaten_kota'] = val.upper()
else:
result['kabupaten_kota'] = text.upper()
break
# NAMA from nama zone (skip label line)
if 'nama' in zone_texts:
for text in zone_texts['nama']:
text_lower = text.lower()
if 'nama' not in text_lower and len(text) > 2:
result['nama'] = text.upper()
break
elif 'nama' in text_lower:
val = self._extract_value_from_text(text)
if val and 'nama' not in val.lower():
result['nama'] = val.upper()
# TTL from ttl zone
if 'ttl' in zone_texts:
for text in zone_texts['ttl']:
if 'tempat' in text.lower() or 'lahir' in text.lower():
val = self._extract_value_from_text(text)
if val:
self._parse_ttl(val, result)
break
# JENIS KELAMIN
if 'jenis_kelamin' in zone_texts:
for text in zone_texts['jenis_kelamin']:
text_lower = text.lower()
if 'laki' in text_lower:
result['jenis_kelamin'] = 'LAKI-LAKI'
break
elif 'perempuan' in text_lower:
result['jenis_kelamin'] = 'PEREMPUAN'
break
# GOL DARAH
if 'gol_darah' in zone_texts:
for text in zone_texts['gol_darah']:
gol_match = re.search(r'([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
if gol_match:
result['gol_darah'] = gol_match.group(1).upper()
break
# ALAMAT
if 'alamat' in zone_texts:
for text in zone_texts['alamat']:
if 'alamat' not in text.lower() or len(zone_texts['alamat']) == 1:
val = self._extract_value_from_text(text) if 'alamat' in text.lower() else text
if val and 'alamat' not in val.lower():
result['alamat'] = val.upper()
break
# PENERBITAN area (tempat & tanggal dalam satu zona)
if 'penerbitan' in zone_texts:
for text in zone_texts['penerbitan']:
# Look for date
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', text)
if date_match and result['tanggal_penerbitan'] is None:
result['tanggal_penerbitan'] = date_match.group(1)
def extract(self, ocr_results: List[Dict]) -> Dict[str, Optional[str]]:
"""
Ekstrak field KTP dari hasil OCR dengan template-based zone detection
Args:
ocr_results: List hasil dari OCREngine.extract_text()
Returns:
Dict dengan field KTP
"""
result = {
'nik': None,
'nama': None,
'tempat_lahir': None,
'tanggal_lahir': None,
'jenis_kelamin': None,
'gol_darah': None,
'alamat': None,
'rt_rw': None,
'kel_desa': None,
'kecamatan': None,
'agama': None,
'status_perkawinan': None,
'pekerjaan': None,
'kewarganegaraan': None,
'berlaku_hingga': None,
'provinsi': None,
'kabupaten_kota': None,
'tanggal_penerbitan': None,
}
# Detect image dimensions from bounding boxes
img_width, img_height = self._detect_image_size(ocr_results)
# Assign zones to each OCR result
zone_texts = {} # zone_name -> list of texts
for r in ocr_results:
x_center = r.get('x_center', 0)
y_center = r.get('y_center', 0)
zone = self._get_zone(x_center, y_center, img_width, img_height)
if zone:
if zone not in zone_texts:
zone_texts[zone] = []
zone_texts[zone].append(r['text'])
# Debug: print zone assignments
print("\n[DEBUG KTPExtractor] Zone assignments:")
for zone, texts in zone_texts.items():
print(f" {zone}: {texts}")
# Extract fields using zone-based approach
self._extract_by_zones(zone_texts, result)
# Gabungkan semua teks untuk fallback pattern matching
texts = [r['text'].strip() for r in ocr_results]
all_text = '\n'.join(texts)
# Ekstrak NIK (16 digit) - bisa ada di mana saja
nik_match = re.search(r'\b(\d{16})\b', all_text)
if nik_match:
result['nik'] = nik_match.group(1)
print(f" -> NIK found: {result['nik']}")
# Fallback: Parse line by line for fields not found by zone
for i, text in enumerate(texts):
text_lower = text.lower()
# Normalize colons
text_normalized = re.sub(self.COLON_PATTERN, ':', text)
text_norm_lower = text_normalized.lower()
# ===== PROVINSI =====
if 'provinsi' in text_lower and result['provinsi'] is None:
val = self._extract_after_label(text_normalized, 'provinsi')
if val:
result['provinsi'] = val.upper()
elif i + 1 < len(texts) and 'provinsi' not in texts[i+1].lower():
# Mungkin value di line berikutnya
result['provinsi'] = texts[i+1].strip().upper()
# ===== KABUPATEN/KOTA =====
if ('kabupaten' in text_lower or 'kota' in text_lower or 'jakarta' in text_lower) and result['kabupaten_kota'] is None:
if 'provinsi' not in text_lower: # Bukan bagian dari provinsi
val = self._extract_after_label(text_normalized, 'kabupaten|kota')
if val:
result['kabupaten_kota'] = val.upper()
else:
result['kabupaten_kota'] = text.strip().upper()
# ===== NAMA =====
if 'nama' in text_lower and result['nama'] is None:
val = self._extract_after_label(text_normalized, 'nama')
if val and len(val) > 2:
result['nama'] = val.upper()
elif i + 1 < len(texts):
# Nama di line berikutnya
next_text = texts[i+1].strip()
if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['tempat', 'lahir', 'jenis']):
result['nama'] = next_text.upper()
# ===== TEMPAT/TANGGAL LAHIR =====
# Match "Tempat/Tgl Lahir" or "Tempat Lahir" or similar labels
if 'tempat' in text_lower or ('lahir' in text_lower and 'berlaku' not in text_lower):
if result['tempat_lahir'] is None or result['tanggal_lahir'] is None:
# Extract value after label using full-width or standard colon
ttl = self._extract_after_label(text_normalized, r'tempat[/\s]*tgl[/\s]*lahir|tempat[/\s]*lahir|lahir')
if ttl:
self._parse_ttl(ttl, result)
elif '' in text or ':' in text:
# Value is after colon but _extract_after_label didn't catch it
parts = re.split(r'[:]', text, 1)
if len(parts) > 1 and parts[1].strip():
self._parse_ttl(parts[1].strip(), result)
elif i + 1 < len(texts):
# TTL di line berikutnya
next_text = texts[i+1].strip()
if not any(kw in next_text.lower() for kw in ['jenis', 'kelamin', 'alamat', 'gol']):
self._parse_ttl(next_text, result)
# ===== JENIS KELAMIN =====
if any(kw in text_lower for kw in self.MALE_KEYWORDS):
if result['jenis_kelamin'] is None:
result['jenis_kelamin'] = 'LAKI-LAKI'
elif any(kw in text_lower for kw in self.FEMALE_KEYWORDS):
if result['jenis_kelamin'] is None:
result['jenis_kelamin'] = 'PEREMPUAN'
# ===== GOLONGAN DARAH =====
if 'darah' in text_lower or 'gol.' in text_lower:
# Try to find blood type on same line
gol_match = re.search(r'(?:gol|darah)[.\s:]*([ABO]{1,2}[+\-]?)', text, re.IGNORECASE)
if gol_match and result['gol_darah'] is None:
result['gol_darah'] = gol_match.group(1).upper()
elif result['gol_darah'] is None and i + 1 < len(texts):
# Blood type might be on next line (real KTP pattern)
next_text = texts[i+1].strip()
if re.match(r'^[ABO]{1,2}[+\-]?$', next_text, re.IGNORECASE):
result['gol_darah'] = next_text.upper()
# Standalone blood type (e.g., just "O" or "A+" on its own line)
if result['gol_darah'] is None:
if re.match(r'^[ABO]{1,2}[+\-]?$', text.strip(), re.IGNORECASE) and len(text.strip()) <= 3:
result['gol_darah'] = text.strip().upper()
# ===== ALAMAT =====
if 'alamat' in text_lower and result['alamat'] is None:
val = self._extract_after_label(text_normalized, 'alamat')
if val:
result['alamat'] = val.upper()
elif i + 1 < len(texts):
result['alamat'] = texts[i+1].strip().upper()
# ===== RT/RW =====
rt_rw_match = re.search(r'(\d{3})\s*/\s*(\d{3})', text)
if rt_rw_match:
result['rt_rw'] = f"{rt_rw_match.group(1)}/{rt_rw_match.group(2)}"
# ===== KELURAHAN/DESA =====
if ('kel' in text_lower or 'desa' in text_lower) and 'kelamin' not in text_lower:
if result['kel_desa'] is None:
val = self._extract_after_label(text_normalized, 'kel|desa')
if val:
result['kel_desa'] = val.upper()
elif i + 1 < len(texts):
result['kel_desa'] = texts[i+1].strip().upper()
# ===== KECAMATAN =====
if 'kecamatan' in text_lower or ('kec' in text_lower and 'kelamin' not in text_lower):
if result['kecamatan'] is None:
val = self._extract_after_label(text_normalized, 'kecamatan|kec')
if val:
result['kecamatan'] = val.upper()
elif i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and not any(kw in next_text.lower() for kw in ['agama', 'status', 'pekerjaan']):
result['kecamatan'] = next_text.upper()
# ===== AGAMA =====
if 'agama' in text_lower:
val = self._extract_after_label(text_normalized, 'agama')
if val and result['agama'] is None:
result['agama'] = val.upper()
elif result['agama'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip().upper()
if next_text in ['ISLAM', 'KRISTEN', 'KATOLIK', 'HINDU', 'BUDHA', 'BUDDHA', 'KONGHUCU']:
result['agama'] = next_text
else:
# Check if line contains only agama name
for agama in self.AGAMA_LIST:
if agama in text_lower and len(text) < 20:
if result['agama'] is None:
result['agama'] = text.strip().upper()
break
# ===== STATUS PERKAWINAN =====
if 'kawin' in text_lower:
if result['status_perkawinan'] is None:
val = self._extract_after_label(text_normalized, 'status.*kawin|perkawinan')
if val:
result['status_perkawinan'] = val.upper()
elif 'belum' in text_lower:
result['status_perkawinan'] = 'BELUM KAWIN'
elif 'kawin' in text_lower and 'cerai' not in text_lower:
result['status_perkawinan'] = 'KAWIN'
elif 'cerai hidup' in text_lower:
result['status_perkawinan'] = 'CERAI HIDUP'
elif 'cerai mati' in text_lower:
result['status_perkawinan'] = 'CERAI MATI'
# ===== PEKERJAAN =====
if 'pekerjaan' in text_lower:
val = self._extract_after_label(text_normalized, 'pekerjaan')
if val and result['pekerjaan'] is None:
result['pekerjaan'] = val.upper()
elif result['pekerjaan'] is None and i + 1 < len(texts):
# Value on next line (real KTP pattern)
next_text = texts[i+1].strip()
if len(next_text) > 2 and 'kewarganegaraan' not in next_text.lower():
result['pekerjaan'] = next_text.upper()
else:
# Check if line contains pekerjaan keyword
for pekerjaan in self.PEKERJAAN_LIST:
if pekerjaan in text_lower and len(text) < 30:
if result['pekerjaan'] is None:
result['pekerjaan'] = text.strip().upper()
break
# ===== KEWARGANEGARAAN =====
if 'wni' in text_lower:
result['kewarganegaraan'] = 'WNI'
elif 'wna' in text_lower:
result['kewarganegaraan'] = 'WNA'
elif 'warga' in text_lower and result['kewarganegaraan'] is None:
val = self._extract_after_label(text_normalized, 'kewarganegaraan|warga')
if val:
result['kewarganegaraan'] = val.upper()
# ===== BERLAKU HINGGA =====
if 'berlaku' in text_lower or 'seumur' in text_lower:
if result['berlaku_hingga'] is None:
if 'seumur' in text_lower or 'hidup' in text_lower:
result['berlaku_hingga'] = 'SEUMUR HIDUP'
else:
val = self._extract_after_label(text_normalized, 'berlaku')
if val:
result['berlaku_hingga'] = val.upper()
# ===== TANGGAL PENERBITAN (biasanya format DD-MM-YYYY di akhir) =====
# Look for date that is NOT tanggal lahir (different date)
if result['tanggal_penerbitan'] is None:
# Match date format at end of text or standalone date
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})$', text.strip())
if date_match:
found_date = date_match.group(1)
# Make sure it's not the same as tanggal_lahir
if result['tanggal_lahir'] != found_date:
# Likely penerbitan if after berlaku_hingga was found
if result['berlaku_hingga'] or i > len(texts) * 0.7:
result['tanggal_penerbitan'] = found_date
# Post-processing
result = self._post_process(result)
return result
def _extract_after_label(self, text: str, label_pattern: str) -> Optional[str]:
"""Ekstrak nilai setelah label (supports various separators)"""
patterns = [
rf'(?:{label_pattern})\s*:\s*(.+)', # label: value
rf'(?:{label_pattern})\s+([A-Z0-9].+)', # label VALUE (uppercase start)
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
value = match.group(1).strip()
# Remove trailing colon or label fragment
value = re.sub(r'^[:\s]+', '', value)
value = re.sub(r'\s*:\s*$', '', value)
if value and len(value) > 1:
return value
return None
def _parse_ttl(self, ttl_text: str, result: Dict):
"""Parse tempat/tanggal lahir dari text"""
ttl_text = ttl_text.strip()
# Normalize dates where OCR missed dashes:
# "05 08 1978" -> "05-08-1978"
# "05 08-1978" -> "05-08-1978"
# "05-08 1978" -> "05-08-1978"
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[\s]+(\d{2})[-/](\d{4})', r'\1-\2-\3', ttl_text)
ttl_text = re.sub(r'(\d{2})[-/](\d{2})[\s]+(\d{4})', r'\1-\2-\3', ttl_text)
# Handle 8-digit date without separator: "05081978" -> "05-08-1978"
date_8digit = re.search(r'(\d{8})', ttl_text)
if date_8digit:
d = date_8digit.group(1)
formatted = f"{d[:2]}-{d[2:4]}-{d[4:]}"
ttl_text = ttl_text.replace(d, formatted)
# Handle merged city+date like "JAKARTA05-08-1978" - add space before digits
ttl_text = re.sub(r'([A-Z])(\d{2}[-/])', r'\1 \2', ttl_text, flags=re.IGNORECASE)
# Format: "TEMPAT, DD-MM-YYYY" atau "TEMPAT DD-MM-YYYY"
date_match = re.search(r'(\d{2}[-/]\d{2}[-/]\d{4})', ttl_text)
if date_match:
result['tanggal_lahir'] = date_match.group(1)
# Tempat adalah bagian sebelum tanggal
place = ttl_text[:date_match.start()].strip(' ,:-/')
# Clean up label remnants
place = re.sub(r'^(tempat|tgl|lahir||:)[/\s:]*', '', place, flags=re.IGNORECASE).strip()
if place and len(place) > 2:
result['tempat_lahir'] = place.upper()
else:
# Coba split by comma
parts = ttl_text.split(',')
if len(parts) >= 2:
result['tempat_lahir'] = parts[0].strip().upper()
result['tanggal_lahir'] = parts[1].strip()
elif len(parts) == 1 and len(ttl_text) > 2:
result['tempat_lahir'] = ttl_text.upper()
def _post_process(self, result: Dict) -> Dict:
"""Post-processing hasil ekstraksi"""
# Validasi NIK (harus 16 digit)
if result['nik'] and not re.match(r'^\d{16}$', result['nik']):
cleaned = re.sub(r'\D', '', result['nik'])
if len(cleaned) == 16:
result['nik'] = cleaned
else:
result['nik'] = None
# Clean all string values - remove leading colons and extra whitespace
for field in result:
if result[field] and isinstance(result[field], str):
val = result[field]
# Remove leading colons (standard and full-width)
val = re.sub(r'^[\s:]+', '', val)
# Remove trailing colons
val = re.sub(r'[\s:]+$', '', val)
# Remove double spaces
val = re.sub(r'\s+', ' ', val)
result[field] = val.strip()
# Bersihkan label dari values
for field in ['nama', 'alamat', 'tempat_lahir', 'kel_desa', 'kecamatan', 'agama', 'pekerjaan']:
if result[field]:
# Remove common labels yang ter-capture
result[field] = re.sub(
r'^(NAMA|ALAMAT|TEMPAT|LAHIR|TGL|KEL|DESA|KELURAHAN|KECAMATAN|KEC|AGAMA|PEKERJAAN|STATUS)[\s:]*',
'', result[field], flags=re.IGNORECASE
).strip()
# Fix status perkawinan yang masih mengandung label
if result['status_perkawinan']:
sp = result['status_perkawinan']
sp = re.sub(r'^(STATUS|PERKAWINAN)[\s:]*', '', sp, flags=re.IGNORECASE).strip()
result['status_perkawinan'] = sp
# Fix berlaku hingga
if result['berlaku_hingga']:
bh = result['berlaku_hingga']
bh = re.sub(r'^(BERLAKU|HINGGA)[\s:]*', '', bh, flags=re.IGNORECASE).strip()
if bh.upper() == 'HIDUP' or 'SEUMUR' in bh.upper():
result['berlaku_hingga'] = 'SEUMUR HIDUP'
else:
result['berlaku_hingga'] = bh
# Fix merged kabupaten/kota names (e.g., JAKARTASELATAN -> JAKARTA SELATAN)
if result['kabupaten_kota']:
kk = result['kabupaten_kota']
# Add space before directional words
kk = re.sub(r'(JAKARTA|BANDUNG|SURABAYA|SEMARANG|MEDAN|BEKASI|TANGERANG|DEPOK|BOGOR)(SELATAN|UTARA|BARAT|TIMUR|PUSAT|TENGAH)',
r'\1 \2', kk, flags=re.IGNORECASE)
# Common merged patterns
kk = re.sub(r'(KOTA|KABUPATEN)([A-Z])', r'\1 \2', kk, flags=re.IGNORECASE)
result['kabupaten_kota'] = kk.upper()
# Fix merged provinsi names
if result['provinsi']:
prov = result['provinsi']
prov = re.sub(r'(DKI|DI)(JAKARTA|YOGYAKARTA)', r'\1 \2', prov, flags=re.IGNORECASE)
prov = re.sub(r'(JAWA|KALIMANTAN|SULAWESI|SUMATERA|NUSA TENGGARA)(BARAT|TIMUR|TENGAH|SELATAN|UTARA)',
r'\1 \2', prov, flags=re.IGNORECASE)
result['provinsi'] = prov.upper()
# Fix merged alamat/address (e.g., JLKECAPIV -> JL KECAPI V)
if result['alamat']:
alamat = result['alamat']
# Add space after common street prefixes
alamat = re.sub(r'^(JL|JLN|JALAN|GG|GANG|NO|BLOK)([A-Z])', r'\1 \2', alamat, flags=re.IGNORECASE)
# Add space before Roman numerals at the end (I, II, III, IV, V, VI, VII, VIII, IX, X)
alamat = re.sub(r'([A-Z])([IVX]+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
# Add space before single digits/numbers at end
alamat = re.sub(r'([A-Z])(\d+)$', r'\1 \2', alamat, flags=re.IGNORECASE)
# Fix common patterns: "NO123" -> "NO 123", "BLOKA" -> "BLOK A"
alamat = re.sub(r'\b(NO|BLOK)(\d+|[A-Z])\b', r'\1 \2', alamat, flags=re.IGNORECASE)
result['alamat'] = alamat.upper()
return result
if __name__ == "__main__":
    # Manual smoke test: feed a hand-written set of OCR lines through the
    # extractor and print every resulting field.
    sample_ocr = [
        {'text': line}
        for line in (
            'PROVINSI JAWA BARAT',
            'KABUPATEN BANDUNG',
            'NIK : 3204012345678901',
            'Nama : JOHN DOE',
            'Tempat/Tgl Lahir : BANDUNG, 01-01-1990',
            'Jenis Kelamin : LAKI-LAKI',
            'Alamat : JL. MERDEKA NO. 123',
            'RT/RW : 001/002',
            'Kel/Desa : SUKAMAJU',
            'Kecamatan : SUKASARI',
            'Agama : ISLAM',
            'Status Perkawinan : BELUM KAWIN',
            'Pekerjaan : KARYAWAN SWASTA',
            'Kewarganegaraan : WNI',
            'Berlaku Hingga : SEUMUR HIDUP',
        )
    ]
    extractor = KTPExtractor()
    result = extractor.extract(sample_ocr)
    for key, value in result.items():
        print(f"{key}: {value}")

153
ocr_engine.py Normal file
View File

@@ -0,0 +1,153 @@
"""
OCR Engine menggunakan PaddleOCR 3.x
Untuk membaca teks dari gambar dokumen Indonesia (KTP, KK)
"""
from paddleocr import PaddleOCR
import cv2
import numpy as np
from PIL import Image
class OCREngine:
    """Thin wrapper around PaddleOCR 3.x for Indonesian documents (KTP, KK)."""

    def __init__(self):
        """Create the PaddleOCR pipeline with orientation and unwarping enabled."""
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=True,  # detect 0/90/180/270 degree rotation
            use_doc_unwarping=True,             # perspective correction
            use_textline_orientation=True,      # per-text-line orientation
        )

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Load an image and prepare it for OCR.

        The image is downscaled so its longest side is at most 2000 px and its
        contrast is enhanced with CLAHE on the grayscale version.

        Args:
            image_path: Path to the image file.

        Returns:
            A 3-channel BGR array built from the enhanced grayscale image.

        Raises:
            ValueError: If OpenCV cannot read the image.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Tidak dapat membaca gambar: {image_path}")

        # Downscale when the longest side exceeds max_dim
        max_dim = 2000
        height, width = img.shape[:2]
        if max(height, width) > max_dim:
            scale = max_dim / max(height, width)
            img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        # Back to 3 channels for the OCR pipeline
        enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
        return enhanced_bgr

    @staticmethod
    def _result_field(res, *names):
        """Return the first non-None field among ``names`` from ``res``.

        Works for results exposing fields as attributes as well as for
        mapping-style results exposing them through ``.get``. Uses ``is None``
        rather than truthiness so NumPy arrays are handled safely.
        """
        for name in names:
            value = getattr(res, name, None)
            if value is None and hasattr(res, 'get'):
                value = res.get(name)
            if value is not None:
                return value
        return []

    @staticmethod
    def _to_bbox(poly) -> list:
        """Convert one polygon (array or sequence of points) to nested lists."""
        if hasattr(poly, 'tolist'):
            return poly.tolist()
        if isinstance(poly, (list, tuple)):
            return list(poly)
        return []

    @staticmethod
    def _bbox_center(bbox):
        """Return ``(x_center, y_center)`` from corners 0 and 2, or ``None``."""
        if not bbox or len(bbox) < 4:
            return None
        try:
            x_center = (bbox[0][0] + bbox[2][0]) / 2
            y_center = (bbox[0][1] + bbox[2][1]) / 2
        except (TypeError, IndexError):
            return None
        return x_center, y_center

    def extract_text(self, image_path: str, preprocess: bool = False) -> list:
        """Run OCR on an image and return the recognised text items.

        Args:
            image_path: Path to the image file.
            preprocess: When True, the image is first passed through
                ``preprocess_image`` and the resulting array is given to the
                OCR pipeline instead of the path.

        Returns:
            List of dicts with keys ``text``, ``confidence``, ``bbox``,
            ``y_center`` and ``x_center``, sorted top-to-bottom then
            left-to-right. An empty list is returned when nothing is found or
            when OCR fails (the error is printed).
        """
        try:
            ocr_input = self.preprocess_image(image_path) if preprocess else image_path
            result = self.ocr.predict(input=ocr_input)
            if not result:
                return []

            extracted = []
            for res in result:
                texts = self._result_field(res, 'rec_texts', 'texts')
                scores = self._result_field(res, 'rec_scores', 'scores')
                # NOTE(review): rec_polys is preferred because it is expected
                # to be index-aligned with rec_texts; confirm against the
                # installed PaddleOCR version. dt_polys is the fallback.
                polys = self._result_field(res, 'rec_polys', 'dt_polys')

                for i, text in enumerate(texts):
                    confidence = scores[i] if i < len(scores) else 0.0
                    bbox = self._to_bbox(polys[i]) if i < len(polys) else []
                    center = self._bbox_center(bbox)
                    if center is None:
                        # No usable geometry: keep the recognition order
                        x_center, y_center = 0, len(extracted) * 10
                    else:
                        x_center, y_center = center
                    extracted.append({
                        'text': text,
                        'confidence': float(confidence),
                        'bbox': bbox,
                        'y_center': y_center,
                        'x_center': x_center,
                    })

            # Top-to-bottom, then left-to-right
            extracted.sort(key=lambda item: (item['y_center'], item['x_center']))
            return extracted
        except Exception as e:
            # Best-effort boundary: report the failure and return no results
            print(f"Error OCR: {e}")
            import traceback
            traceback.print_exc()
            return []

    def get_raw_text(self, image_path: str) -> str:
        """Return all recognised text of the image as one newline-joined string."""
        results = self.extract_text(image_path)
        return '\n'.join([r['text'] for r in results])
# Module-wide engine, created lazily on first request
_ocr_engine = None


def get_ocr_engine() -> OCREngine:
    """Return the shared OCREngine, constructing it on the first call."""
    global _ocr_engine
    if _ocr_engine is not None:
        return _ocr_engine
    _ocr_engine = OCREngine()
    return _ocr_engine
if __name__ == "__main__":
    # Command-line check: OCR the image given as first argument and print
    # each recognised line with its confidence.
    import sys
    if len(sys.argv) > 1:
        engine = get_ocr_engine()
        results = engine.extract_text(sys.argv[1])
        for item in results:
            score, text = item['confidence'], item['text']
            print(f"[{score:.2f}] {text}")

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
paddlepaddle
paddleocr
flask
pillow
opencv-python

538
static/style.css Normal file
View File

@@ -0,0 +1,538 @@
/* OCR KTP/KK - Modern Dark Theme */
:root {
--bg-primary: #0f0f1a;
--bg-secondary: #1a1a2e;
--bg-tertiary: #252540;
--accent-primary: #6366f1;
--accent-secondary: #818cf8;
--accent-gradient: linear-gradient(135deg, #6366f1 0%, #a855f7 100%);
--text-primary: #f1f5f9;
--text-secondary: #94a3b8;
--text-muted: #64748b;
--success: #22c55e;
--error: #ef4444;
--warning: #f59e0b;
--border: #334155;
--shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
--shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.4);
--radius: 12px;
--radius-lg: 16px;
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
background: var(--bg-primary);
color: var(--text-primary);
min-height: 100vh;
line-height: 1.6;
}
.container {
max-width: 800px;
margin: 0 auto;
padding: 2rem 1rem;
}
/* Header */
header {
text-align: center;
margin-bottom: 2rem;
}
header h1 {
font-size: 2.5rem;
font-weight: 700;
background: var(--accent-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin-bottom: 0.5rem;
}
.subtitle {
color: var(--text-secondary);
font-size: 1.1rem;
}
/* Upload Section */
.upload-section {
background: var(--bg-secondary);
border-radius: var(--radius-lg);
padding: 2rem;
box-shadow: var(--shadow-lg);
margin-bottom: 2rem;
}
/* Document Type Selector */
.doc-type-selector {
display: flex;
gap: 1rem;
margin-bottom: 1.5rem;
}
.doc-btn {
flex: 1;
display: flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
padding: 1rem;
background: var(--bg-tertiary);
border: 2px solid transparent;
border-radius: var(--radius);
color: var(--text-secondary);
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
}
.doc-btn:hover {
background: var(--bg-primary);
color: var(--text-primary);
}
.doc-btn.active {
background: var(--accent-gradient);
color: white;
border-color: var(--accent-secondary);
}
.doc-btn .icon {
font-size: 1.5rem;
}
/* Dropzone */
.dropzone {
border: 2px dashed var(--border);
border-radius: var(--radius);
padding: 3rem 2rem;
text-align: center;
cursor: pointer;
transition: all 0.3s ease;
background: var(--bg-tertiary);
position: relative;
overflow: hidden;
}
.dropzone:hover,
.dropzone.dragover {
border-color: var(--accent-primary);
background: rgba(99, 102, 241, 0.1);
}
.dropzone-content {
display: flex;
flex-direction: column;
align-items: center;
gap: 0.5rem;
}
.upload-icon {
font-size: 4rem;
margin-bottom: 0.5rem;
}
.dropzone p {
color: var(--text-secondary);
}
.dropzone .hint {
color: var(--text-muted);
font-size: 0.875rem;
}
.file-btn {
display: inline-block;
padding: 0.75rem 1.5rem;
background: var(--accent-gradient);
color: white;
border-radius: var(--radius);
font-weight: 600;
cursor: pointer;
margin: 0.5rem 0;
transition: transform 0.2s ease;
}
.file-btn:hover {
transform: scale(1.05);
}
.file-types {
font-size: 0.75rem;
color: var(--text-muted);
}
.preview-image {
max-width: 100%;
max-height: 400px;
border-radius: var(--radius);
cursor: pointer;
}
/* Process Button */
.process-btn {
width: 100%;
padding: 1rem;
margin-top: 1.5rem;
background: var(--accent-gradient);
border: none;
border-radius: var(--radius);
color: white;
font-size: 1.1rem;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: var(--shadow);
}
.process-btn:hover:not(:disabled) {
transform: translateY(-2px);
box-shadow: var(--shadow-lg);
}
.process-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
/* Results Section */
.results-section {
background: var(--bg-secondary);
border-radius: var(--radius-lg);
padding: 2rem;
box-shadow: var(--shadow-lg);
animation: slideUp 0.3s ease;
}
@keyframes slideUp {
from {
opacity: 0;
transform: translateY(20px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
.results-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 1.5rem;
flex-wrap: wrap;
gap: 1rem;
}
.results-header h2 {
font-size: 1.5rem;
}
.results-actions {
display: flex;
gap: 0.5rem;
}
.action-btn {
padding: 0.5rem 1rem;
background: var(--bg-tertiary);
border: 1px solid var(--border);
border-radius: var(--radius);
color: var(--text-primary);
font-size: 0.875rem;
cursor: pointer;
transition: all 0.2s ease;
}
.action-btn:hover {
background: var(--accent-primary);
border-color: var(--accent-primary);
}
.action-btn.secondary {
background: transparent;
}
/* Results Table */
.results-table {
width: 100%;
border-collapse: collapse;
}
.results-table th,
.results-table td {
padding: 0.875rem 1rem;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: var(--bg-tertiary);
color: var(--text-secondary);
font-weight: 600;
font-size: 0.875rem;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.results-table th:first-child {
border-radius: var(--radius) 0 0 0;
}
.results-table th:last-child {
border-radius: 0 var(--radius) 0 0;
}
.field-label {
color: var(--text-secondary);
font-weight: 500;
width: 40%;
}
.field-value {
color: var(--text-primary);
font-weight: 600;
}
.results-table tr:hover {
background: rgba(99, 102, 241, 0.05);
}
/* Editable Fields */
.editable-field {
width: 100%;
padding: 0.5rem 0.75rem;
background: var(--bg-tertiary);
border: 1px solid var(--border);
border-radius: 6px;
color: var(--text-primary);
font-size: 0.95rem;
font-weight: 600;
font-family: inherit;
transition: all 0.2s ease;
}
.editable-field:focus {
outline: none;
border-color: var(--accent-primary);
background: var(--bg-secondary);
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2);
}
.editable-field::placeholder {
color: var(--text-muted);
font-weight: 400;
}
/* Region Dropdown Styles */
.region-field-wrapper {
display: flex;
gap: 0.5rem;
align-items: center;
}
.region-field-wrapper input,
.region-field-wrapper select {
flex: 1;
}
.region-dropdown {
width: 100%;
padding: 0.5rem 0.75rem;
background: var(--bg-tertiary);
border: 1px solid var(--border);
border-radius: 6px;
color: var(--text-primary);
font-size: 0.95rem;
font-family: inherit;
cursor: pointer;
}
.region-dropdown:focus {
outline: none;
border-color: var(--accent-primary);
}
.dropdown-toggle {
padding: 0.5rem 0.75rem;
background: var(--bg-tertiary);
border: 1px solid var(--border);
border-radius: 6px;
color: var(--text-secondary);
cursor: pointer;
transition: all 0.2s ease;
flex-shrink: 0;
}
.dropdown-toggle:hover {
background: var(--accent-primary);
color: white;
}
.dropdown-toggle.confirmed {
background: var(--success);
color: white;
border-color: var(--success);
}
/* Validation Indicators */
.validation-status {
margin-left: 0.5rem;
font-size: 0.875rem;
}
.validation-status.valid-field {
color: var(--success);
}
.validation-status.invalid-field {
color: var(--warning);
}
.editable-field.valid-field {
border-color: var(--success);
}
.editable-field.invalid-field {
border-color: var(--warning);
}
.suggestion-text {
font-size: 0.75rem;
color: var(--text-muted);
margin-top: 0.25rem;
font-style: italic;
}
/* Raw Text Section */
.raw-text-section {
margin-top: 1.5rem;
padding-top: 1.5rem;
border-top: 1px solid var(--border);
}
.raw-text-section h3 {
font-size: 1rem;
color: var(--text-secondary);
margin-bottom: 1rem;
}
.raw-text-section pre {
background: var(--bg-primary);
padding: 1rem;
border-radius: var(--radius);
font-family: 'Consolas', monospace;
font-size: 0.875rem;
color: var(--text-secondary);
white-space: pre-wrap;
word-wrap: break-word;
max-height: 300px;
overflow-y: auto;
}
/* Error Section */
.error-section {
margin-top: 1rem;
}
.error-content {
background: rgba(239, 68, 68, 0.1);
border: 1px solid var(--error);
border-radius: var(--radius);
padding: 1rem;
display: flex;
align-items: center;
gap: 0.75rem;
}
.error-icon {
font-size: 1.5rem;
}
.error-content p {
color: var(--error);
}
/* Footer */
footer {
text-align: center;
margin-top: 2rem;
padding-top: 1rem;
border-top: 1px solid var(--border);
}
footer p {
color: var(--text-muted);
font-size: 0.875rem;
}
footer a {
color: var(--accent-secondary);
text-decoration: none;
}
footer a:hover {
text-decoration: underline;
}
/* Responsive */
@media (max-width: 600px) {
.container {
padding: 1rem;
}
header h1 {
font-size: 2rem;
}
.upload-section,
.results-section {
padding: 1.5rem;
}
.doc-type-selector {
flex-direction: column;
}
.results-header {
flex-direction: column;
align-items: flex-start;
}
.results-actions {
width: 100%;
justify-content: flex-start;
}
.field-label {
width: 45%;
}
}
/* Scrollbar */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: var(--bg-tertiary);
}
::-webkit-scrollbar-thumb {
background: var(--border);
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: var(--text-muted);
}

570
templates/index.html Normal file
View File

@@ -0,0 +1,570 @@
<!DOCTYPE html>
<html lang="id">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCR KTP/KK - Pembaca Dokumen Indonesia</title>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<div class="container">
<header>
<h1>📄 OCR KTP/KK</h1>
<p class="subtitle">Pembaca Dokumen Indonesia Offline</p>
</header>
<main>
<!-- Upload Section -->
<section class="upload-section">
<div class="doc-type-selector">
<button class="doc-btn active" data-type="ktp">
<span class="icon">🪪</span>
KTP
</button>
<button class="doc-btn" data-type="kk">
<span class="icon">👨‍👩‍👧‍👦</span>
Kartu Keluarga
</button>
</div>
<div class="dropzone" id="dropzone">
<div class="dropzone-content">
<div class="upload-icon">📷</div>
<p>Drag & drop gambar di sini</p>
<p class="hint">atau</p>
<label class="file-btn">
Pilih File
<input type="file" id="fileInput" accept="image/*" hidden>
</label>
<p class="file-types">PNG, JPG, JPEG, BMP, WEBP (max 16MB)</p>
</div>
<img id="preview" class="preview-image" style="display: none;">
</div>
<button id="processBtn" class="process-btn" disabled>
<span class="btn-text">🔍 Proses OCR</span>
<span class="btn-loading" style="display: none;">⏳ Memproses...</span>
</button>
</section>
<!-- Results Section -->
<section class="results-section" id="resultsSection" style="display: none;">
<div class="results-header">
<h2>📋 Hasil Ekstraksi</h2>
<div class="results-actions">
<button class="action-btn" id="copyBtn" title="Copy JSON">📋 Copy</button>
<button class="action-btn" id="exportBtn" title="Export JSON">💾 Export</button>
<button class="action-btn secondary" id="toggleRaw">📝 Raw Text</button>
</div>
</div>
<div class="results-content">
<table class="results-table" id="resultsTable">
<thead>
<tr>
<th>Field</th>
<th>Nilai</th>
</tr>
</thead>
<tbody id="resultsBody">
</tbody>
</table>
<div class="raw-text-section" id="rawTextSection" style="display: none;">
<h3>Raw OCR Text</h3>
<pre id="rawText"></pre>
</div>
</div>
</section>
<!-- Error Section -->
<section class="error-section" id="errorSection" style="display: none;">
<div class="error-content">
<span class="error-icon">⚠️</span>
<p id="errorMessage"></p>
</div>
</section>
</main>
<footer>
<p>OCR menggunakan <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a> • Data
diproses secara lokal</p>
</footer>
</div>
<script>
// State
let selectedFile = null;
let docType = 'ktp';
let extractedData = null;
// Elements
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const preview = document.getElementById('preview');
const processBtn = document.getElementById('processBtn');
const resultsSection = document.getElementById('resultsSection');
const resultsBody = document.getElementById('resultsBody');
const rawText = document.getElementById('rawText');
const rawTextSection = document.getElementById('rawTextSection');
const errorSection = document.getElementById('errorSection');
const errorMessage = document.getElementById('errorMessage');
const docBtns = document.querySelectorAll('.doc-btn');
// Field labels untuk display
const fieldLabels = {
// KTP
'nik': 'NIK',
'nama': 'Nama',
'tempat_lahir': 'Tempat Lahir',
'tanggal_lahir': 'Tanggal Lahir',
'jenis_kelamin': 'Jenis Kelamin',
'gol_darah': 'Gol. Darah',
'alamat': 'Alamat',
'rt_rw': 'RT/RW',
'kel_desa': 'Kel/Desa',
'kecamatan': 'Kecamatan',
'agama': 'Agama',
'status_perkawinan': 'Status Perkawinan',
'pekerjaan': 'Pekerjaan',
'kewarganegaraan': 'Kewarganegaraan',
'berlaku_hingga': 'Berlaku Hingga',
'provinsi': 'Provinsi',
'kabupaten_kota': 'Kabupaten/Kota',
'tanggal_penerbitan': 'Tanggal Penerbitan',
// KK
'no_kk': 'No. KK',
'nama_kepala_keluarga': 'Kepala Keluarga',
'kode_pos': 'Kode Pos',
'anggota_keluarga': 'Jumlah Anggota'
};
// Doc type selection
docBtns.forEach(btn => {
btn.addEventListener('click', () => {
docBtns.forEach(b => b.classList.remove('active'));
btn.classList.add('active');
docType = btn.dataset.type;
});
});
// Drag & drop
dropzone.addEventListener('dragover', (e) => {
e.preventDefault();
dropzone.classList.add('dragover');
});
dropzone.addEventListener('dragleave', () => {
dropzone.classList.remove('dragover');
});
dropzone.addEventListener('drop', (e) => {
e.preventDefault();
dropzone.classList.remove('dragover');
const files = e.dataTransfer.files;
if (files.length > 0) {
handleFile(files[0]);
}
});
// File input
fileInput.addEventListener('change', (e) => {
if (e.target.files.length > 0) {
handleFile(e.target.files[0]);
}
});
// Click on dropzone
dropzone.addEventListener('click', (e) => {
if (e.target === dropzone || e.target.closest('.dropzone-content')) {
fileInput.click();
}
});
/**
 * Accept a user-chosen file: validate it, remember it as the current
 * selection, show its preview and enable the process button.
 */
function handleFile(file) {
    const MAX_BYTES = 16 * 1024 * 1024;
    const isImage = file.type.startsWith('image/');
    if (!isImage) {
        showError('File harus berupa gambar');
        return;
    }
    if (file.size > MAX_BYTES) {
        showError('Ukuran file maksimal 16MB');
        return;
    }
    selectedFile = file;

    // Swap the dropzone prompt for a preview once the file has been read
    const reader = new FileReader();
    reader.onload = (event) => {
        const placeholder = dropzone.querySelector('.dropzone-content');
        preview.src = event.target.result;
        preview.style.display = 'block';
        placeholder.style.display = 'none';
    };
    reader.readAsDataURL(file);

    processBtn.disabled = false;
    hideError();
    resultsSection.style.display = 'none';
}
// Process button: upload the selected file, then render the OCR result.
processBtn.addEventListener('click', async () => {
    if (!selectedFile) return;
    const btnText = processBtn.querySelector('.btn-text');
    const btnLoading = processBtn.querySelector('.btn-loading');
    processBtn.disabled = true;
    btnText.style.display = 'none';
    btnLoading.style.display = 'inline';
    try {
        const formData = new FormData();
        formData.append('file', selectedFile);
        formData.append('doc_type', docType);
        const response = await fetch('/upload', {
            method: 'POST',
            body: formData
        });
        const result = await response.json();
        if (result.success) {
            extractedData = result.data;
            // displayResults is async: await it so rendering errors reach the
            // catch below and the button stays disabled until rendering ends.
            await displayResults(result);
            hideError();
        } else {
            showError(result.error);
            resultsSection.style.display = 'none';
        }
    } catch (error) {
        showError('Terjadi kesalahan: ' + error.message);
    } finally {
        processBtn.disabled = false;
        btnText.style.display = 'inline';
        btnLoading.style.display = 'none';
    }
});
// Region fields that use dropdowns - in hierarchical order
const regionFields = ['provinsi', 'kabupaten_kota', 'kecamatan', 'kel_desa'];
let regionData = {
provinces: [],
regencies: {},
districts: {},
villages: {}
};
let validationResult = null;
// Define field display order
const fieldOrder = [
// Location hierarchy first
'provinsi', 'kabupaten_kota', 'kecamatan', 'kel_desa',
// Identity
'nik', 'nama', 'tempat_lahir', 'tanggal_lahir', 'jenis_kelamin', 'gol_darah',
// Address
'alamat', 'rt_rw',
// Other info
'agama', 'status_perkawinan', 'pekerjaan', 'kewarganegaraan', 'berlaku_hingga',
// Issue date
'tanggal_penerbitan',
// KK specific
'no_kk', 'nama_kepala_keluarga', 'kode_pos', 'anggota_keluarga'
];
/**
 * Render the extraction result as table rows (ordered by fieldOrder),
 * fill the raw-text panel and scroll the results into view.
 */
async function displayResults(result) {
    resultsBody.innerHTML = '';
    const { data } = result;
    extractedData = data;

    // Region validation must finish before region rows are built
    await validateRegionData(data);

    // Keys listed in fieldOrder come first, in that order; unknown keys last
    const rank = (name) => {
        const position = fieldOrder.indexOf(name);
        return position === -1 ? Infinity : position;
    };
    const orderedKeys = Object.keys(data).sort((left, right) => {
        const rankLeft = rank(left);
        const rankRight = rank(right);
        if (rankLeft === rankRight) return 0;
        return rankLeft < rankRight ? -1 : 1;
    });

    for (const key of orderedKeys) {
        const value = data[key];
        if (key === 'anggota_keluarga') {
            // Family members are summarised as a count only
            const count = Array.isArray(value) ? value.length : 0;
            addResultRow('Jumlah Anggota', count + ' orang', null, false);
            continue;
        }
        const label = fieldLabels[key] || key;
        if (regionFields.includes(key)) {
            await addRegionRow(label, value || '', key);
        } else {
            addResultRow(label, value || '', key, true);
        }
    }

    rawText.textContent = result.raw_text;
    resultsSection.style.display = 'block';
    resultsSection.scrollIntoView({ behavior: 'smooth' });
}
/**
 * Ask the server to validate the region fields of `data` and store the
 * answer in `validationResult`. Failures are logged and leave
 * `validationResult` empty.
 */
async function validateRegionData(data) {
    // Drop the previous document's validation first; otherwise a failed
    // request would leave stale codes/suggestions attached to the new data.
    validationResult = null;
    try {
        const response = await fetch('/api/validate-region', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(data)
        });
        const result = await response.json();
        if (result.success) {
            validationResult = result.validation;
        }
    } catch (e) {
        console.error('Validation error:', e);
    }
}
/**
 * Append a table row for a region field: a text input pre-filled with the
 * validated suggestion (or the OCR value), a hidden dropdown that can be
 * toggled to pick from the official list, and a validation indicator.
 */
async function addRegionRow(label, value, key) {
    const validation = validationResult?.[key];
    const isValid = validation?.valid;
    const suggestion = validation?.suggestion;
    const statusIcon = isValid ? '✓' : (value ? '⚠' : '');
    const statusClass = isValid ? 'valid-field' : (value ? 'invalid-field' : '');
    const shownValue = suggestion || value || '';

    // Only static markup goes through innerHTML; every OCR- or server-derived
    // string is assigned via DOM properties so quotes or "<" in the text can
    // neither break the attribute nor inject markup.
    const row = document.createElement('tr');
    row.innerHTML = `
        <td class="field-label">
            <span class="validation-status"></span>
        </td>
        <td class="field-value">
            <div class="region-field-wrapper">
                <input type="text" class="editable-field" placeholder="Ketik atau pilih...">
                <select class="region-dropdown" style="display: none;">
                    <option value="">-- Pilih --</option>
                </select>
                <button type="button" class="dropdown-toggle" title="Pilih dari daftar">▼</button>
            </div>
        </td>
    `;
    const labelCell = row.querySelector('.field-label');
    const status = row.querySelector('.validation-status');
    const input = row.querySelector('input');
    const select = row.querySelector('select');
    const toggleBtn = row.querySelector('.dropdown-toggle');

    labelCell.insertBefore(document.createTextNode(label + ' '), status);
    status.textContent = statusIcon;
    if (statusClass) {
        status.classList.add(statusClass);
        input.classList.add(statusClass);
    }
    input.dataset.key = key;
    select.dataset.key = key;
    toggleBtn.dataset.key = key;
    input.value = shownValue;

    if (suggestion && suggestion !== value) {
        const hint = document.createElement('div');
        hint.className = 'suggestion-text';
        hint.textContent = `Saran: ${suggestion}`;
        row.querySelector('.field-value').appendChild(hint);
        // The input shows the suggestion, so copy/export must carry it too
        if (extractedData) {
            extractedData[key] = shownValue;
        }
    }

    // Manual edits
    input.addEventListener('input', (e) => {
        if (extractedData) {
            extractedData[key] = e.target.value;
        }
    });

    // Switch between free-text input and dropdown
    toggleBtn.addEventListener('click', async () => {
        if (select.style.display === 'none') {
            await loadDropdownOptions(key, select);
            select.style.display = 'block';
            input.style.display = 'none';
        } else {
            select.style.display = 'none';
            input.style.display = 'block';
        }
    });

    // Dropdown selection
    select.addEventListener('change', (e) => {
        const selectedOption = e.target.options[e.target.selectedIndex];
        const selectedCode = selectedOption.value;
        const selectedName = selectedOption.text !== '-- Pilih --' ? selectedOption.text : '';
        input.value = selectedName;
        if (extractedData) {
            extractedData[key] = selectedName;
        }
        // Remember the chosen code so child dropdowns can be scoped to it
        if (!validationResult) validationResult = {};
        validationResult[key] = {
            valid: !!selectedCode,
            code: selectedCode,
            suggestion: selectedName
        };
        select.style.display = 'none';
        input.style.display = 'block';
        if (selectedCode) {
            toggleBtn.textContent = '✓';
            toggleBtn.classList.add('confirmed');
            input.classList.remove('invalid-field');
            input.classList.add('valid-field');
        } else {
            toggleBtn.textContent = '▼';
            toggleBtn.classList.remove('confirmed');
        }
        // Children of a changed region are no longer valid
        clearDependentFields(key);
    });

    resultsBody.appendChild(row);
}
/**
 * Fill `select` with the region list for `key`. Provinces are loaded
 * unconditionally; every other level is scoped by the code of its parent
 * field taken from `validationResult`. Lists are cached in `regionData`.
 */
async function loadDropdownOptions(key, select) {
    select.innerHTML = '<option value="">Loading...</option>';

    // Child level -> cache bucket, API path segment and parent field
    const sources = {
        kabupaten_kota: { cache: regionData.regencies, endpoint: 'regencies', parent: 'provinsi' },
        kecamatan: { cache: regionData.districts, endpoint: 'districts', parent: 'kabupaten_kota' },
        kel_desa: { cache: regionData.villages, endpoint: 'villages', parent: 'kecamatan' }
    };

    // Fetch one list; an HTTP error throws so it is reported instead of
    // being cached as an empty list.
    const fetchList = async (url) => {
        const res = await fetch(url);
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const json = await res.json();
        return json.data || [];
    };

    try {
        let data = [];
        if (key === 'provinsi') {
            if (!regionData.provinces.length) {
                regionData.provinces = await fetchList('/api/provinces');
            }
            data = regionData.provinces;
        } else if (Object.prototype.hasOwnProperty.call(sources, key)) {
            const { cache, endpoint, parent } = sources[key];
            const parentCode = validationResult?.[parent]?.code;
            // Without a confirmed parent code the list stays empty
            if (parentCode) {
                if (!cache[parentCode]) {
                    cache[parentCode] = await fetchList(`/api/${endpoint}/${encodeURIComponent(parentCode)}`);
                }
                data = cache[parentCode];
            }
        }
        select.innerHTML = '<option value="">-- Pilih --</option>';
        data.forEach(item => {
            const option = document.createElement('option');
            option.value = item.code;
            option.textContent = item.name;
            select.appendChild(option);
        });
    } catch (e) {
        select.innerHTML = '<option value="">Error loading data</option>';
    }
}
/**
 * Blank every region field that sits below `key` in the hierarchy:
 * its input, its entry in extractedData and its validation record.
 */
function clearDependentFields(key) {
    const childrenOf = {
        'provinsi': ['kabupaten_kota', 'kecamatan', 'kel_desa'],
        'kabupaten_kota': ['kecamatan', 'kel_desa'],
        'kecamatan': ['kel_desa']
    };
    const children = childrenOf[key] || [];
    for (const child of children) {
        const field = document.querySelector(`input[data-key="${child}"]`);
        if (field) {
            field.value = '';
        }
        if (extractedData) {
            extractedData[child] = '';
        }
        // Reset the code so dropdowns further down are not scoped to stale data
        if (validationResult && validationResult[child]) {
            validationResult[child] = { valid: false, code: null, suggestion: null };
        }
    }
}
/**
 * Append a label/value row to the results table. With `editable` and a
 * `key`, the value is shown in a text input whose edits are written back
 * to `extractedData[key]`; otherwise it is shown as plain text.
 */
function addResultRow(label, value, key, editable = true) {
    // Cells are built with DOM APIs rather than innerHTML: OCR text may
    // contain quotes or "<", which would break the value attribute or be
    // parsed as markup.
    const row = document.createElement('tr');

    const labelCell = document.createElement('td');
    labelCell.className = 'field-label';
    labelCell.textContent = label;

    const valueCell = document.createElement('td');
    valueCell.className = 'field-value';

    if (editable && key) {
        const input = document.createElement('input');
        input.type = 'text';
        input.className = 'editable-field';
        input.dataset.key = key;
        input.value = value || '';
        input.placeholder = 'Klik untuk edit...';
        input.addEventListener('input', (e) => {
            if (extractedData && key) {
                extractedData[key] = e.target.value;
            }
        });
        valueCell.appendChild(input);
    } else {
        valueCell.textContent = value || '-';
    }

    row.append(labelCell, valueCell);
    resultsBody.appendChild(row);
}
// Toggle raw text
document.getElementById('toggleRaw').addEventListener('click', () => {
const isVisible = rawTextSection.style.display !== 'none';
rawTextSection.style.display = isVisible ? 'none' : 'block';
});
// Copy to clipboard
document.getElementById('copyBtn').addEventListener('click', () => {
if (extractedData) {
navigator.clipboard.writeText(JSON.stringify(extractedData, null, 2))
.then(() => alert('Data berhasil disalin!'));
}
});
// Export JSON
document.getElementById('exportBtn').addEventListener('click', () => {
if (extractedData) {
const blob = new Blob([JSON.stringify(extractedData, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${docType}_data.json`;
a.click();
URL.revokeObjectURL(url);
}
});
/** Reveal the error panel with the given message. */
function showError(message) {
    const panelStyle = errorSection.style;
    errorMessage.textContent = message;
    panelStyle.display = 'block';
}
/** Hide the error panel. */
function hideError() {
    const panelStyle = errorSection.style;
    panelStyle.display = 'none';
}
// Reset on new file selection
preview.addEventListener('click', () => {
preview.style.display = 'none';
dropzone.querySelector('.dropzone-content').style.display = 'flex';
selectedFile = null;
processBtn.disabled = true;
fileInput.value = '';
});
</script>
</body>
</html>