Commit e03cbcec by Owo Sugiana

Kali pertama

0 parents
*egg-info
__pycache__
0.1 2023-04-02
----------
- Kali pertama
include *.txt *.rst *.py
recursive-include ocr_ktp *.py *.traineddata
OCR KTP
=======
Optical Character Recognition (OCR) adalah proses membaca foto yang berisi
huruf, angka, serta karakter lainnya. Paket ini memilah hasil pembacaan foto
KTP menjadi bentuk baku seperti NIK, Nama, Alamat, dst.
Pemasangan Tesseract
--------------------
Paket ini membutuhkan `Tesseract <https://github.com/tesseract-ocr/tesseract>`_.
Untuk Debian dan turunannya bisa dengan cara berikut::

    $ sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel
    $ sudo apt update
    $ sudo apt install -y tesseract-ocr

Setelah paket ini dipasang, fotolah sebuah KTP dan coba::

    $ cd contrib
    $ ~/env/bin/python uji-ktp.py ktp.jpg

Jika NIK tidak sesuai - misalnya tidak 16 digit - maka tidak akan ditampilkan.
Hal lainnya yang tidak baku juga tidak akan tampak misalnya tanggal yang tidak
benar, RT / RW bukan angka, dst. Cobalah foto ulang, pastikan alas meja tidak
terlihat.
Semoga berhasil.
import sys
from argparse import ArgumentParser
from pprint import pprint
from ocr_ktp.ktp import ocr
# Default Tesseract language-data directory; a relative path, so it is
# resolved against the current working directory.
lang_dir = '../ocr_ktp/data'
help_lang_dir = 'default {}'.format(lang_dir)

# Command line: one positional image path plus an optional --lang-dir.
pars = ArgumentParser()
pars.add_argument('img_file')
pars.add_argument('--lang-dir', help=help_lang_dir, default=lang_dir)

# parse_args() without arguments reads sys.argv[1:].
option = pars.parse_args()

# Run the KTP OCR and pretty-print the resulting dictionary.
d = ocr(option.img_file, option.lang_dir)
pprint(d)
from cv2 import (
imread,
cvtColor,
COLOR_BGR2GRAY,
GaussianBlur,
threshold,
THRESH_BINARY,
)
from pytesseract import image_to_string
def ocr(image, lang='eng', tess_opt='--psm 6'):
    """Binarise an image, keep its left 70 %, and return the Tesseract text.

    The image is converted to grayscale, smoothed with a 3x3 Gaussian blur
    and thresholded at 127 before the right-hand 30 % of its width is cut
    off (the original comment says this removes the signature).

    Args:
        image: Either a path (``str``) that is loaded with ``imread``, or an
            already loaded image array that ``cvtColor`` can convert with
            ``COLOR_BGR2GRAY``.
        lang: Language passed to ``image_to_string``.
        tess_opt: Extra options passed to ``image_to_string`` as ``config``.

    Returns:
        Whatever ``image_to_string`` returns for the cropped image.

    Raises:
        ValueError: If ``image`` is a path that ``imread`` could not load.
    """
    if isinstance(image, str):
        path = image
        image = imread(path)  # Color
        if image is None:
            # imread() reports a missing or undecodable file by returning
            # None; fail here with a readable message instead of letting
            # cvtColor() raise an opaque assertion error.
            raise ValueError('Cannot read image file: {}'.format(path))
    gray = cvtColor(image, COLOR_BGR2GRAY)  # Grayscale
    blurred = GaussianBlur(gray, (3, 3), 0)  # Smoothing
    _, thresh = threshold(blurred, 127, 255, THRESH_BINARY)  # Black & white
    height, width = image.shape[:2]
    kept_width = int(width * 7 / 10)
    cropped = thresh[0:height, 0:kept_width]  # Discard the signature
    return image_to_string(cropped, lang=lang, config=tess_opt)
This file is too large to display.
from datetime import datetime
from string import (
ascii_letters,
ascii_uppercase,
digits,
)
from cv2 import (
imread,
cvtColor,
COLOR_BGR2GRAY,
GaussianBlur,
threshold,
THRESH_BINARY,
)
from pytesseract import image_to_string
from jaccard_index.jaccard import jaccard_index
from . import ocr as base_ocr
# Field labels that parse() tries to recognise in the first word of a line.
LABELS = (
    'NIK',
    'Nama',
    'Tempat/Tgl Lahir',
    'Jenis',
    'Alamat',
    'RT/RW',
    'Kel/Desa',
    'Kecamatan',
    'Agama',
    'Status',
    'Pekerjaan',
    'Kewarganegaraan',
)

# Labels whose value is picked from a closed set of reference values.
VALUES = {
    'Jenis': ('LAKI-LAKI', 'PEREMPUAN'),
    'Agama': (
        'ISLAM',
        'KRISTEN',
        'KATOLIK',
        'BUDHA',
        'HINDU',
        'KONGHUCHU',
        'KEPERCAYAAN',
    ),
    'Status': ('KAWIN', 'BELUM'),
    'Kewarganegaraan': ('WNI', 'WNA'),
}

# NOTE(review): not referenced by any code visible in this file.
NUMBERS = ('NIK', 'RT/RW')

# Characters that parse() keeps at the edges of a line.
digits_letters = ''.join((digits, ascii_letters))
def is_date(s):
    """Parse *s* as a ``DD-MM-YYYY`` date.

    Returns the matching ``datetime.date``, or ``None`` when ``strptime``
    rejects the string (wrong format or impossible calendar date).
    """
    try:
        return datetime.strptime(s, '%d-%m-%Y').date()
    except ValueError:
        return None
def clean(s, chars=ascii_letters):
    """Trim *s* so that it starts and ends with a character from *chars*.

    Characters between the first and the last allowed character are kept
    untouched. Returns ``''`` when *s* contains no character from *chars*.
    """
    allowed_positions = [pos for pos, ch in enumerate(s) if ch in chars]
    if not allowed_positions:
        return ''
    start = allowed_positions[0]
    stop = allowed_positions[-1] + 1
    return s[start:stop]
def match_value(values, ref_values):
    """Return the reference value that best matches any of *values*.

    Every candidate longer than two characters is scored against every
    entry of *ref_values* with ``jaccard_index``. Scores below 0.3 are
    ignored; the reference with the highest score wins, and on equal scores
    the one seen first is kept. Returns ``None`` when nothing qualifies.
    """
    best_score = 0
    best_ref = None
    # Candidates of one or two characters are skipped.
    candidates = (value for value in values if len(value) > 2)
    for candidate in candidates:
        for ref in ref_values:
            score = jaccard_index(candidate, ref)
            if score >= 0.3 and score > best_score:
                best_score = score
                best_ref = ref
    return best_ref
def nik(v):
    """Extract a NIK from *v*.

    The text is trimmed to start and end with a digit; the result is
    accepted only if it is exactly 16 characters long and all of them are
    digits. Returns ``{'NIK': value}`` on success, otherwise an empty dict.
    """
    candidate = clean(v, digits)
    if len(candidate) == 16 and all(ch in digits for ch in candidate):
        return {'NIK': candidate}
    return {}
def rt_rw(v):
    """Split an ``RT/RW`` value into its two numeric parts.

    *v* is split on ``'/'``; the first part is the RT and the second the RW
    (further parts are ignored). Each part is trimmed to start and end with
    a digit and is kept only when every remaining character is a digit, so
    partially misread values such as ``'0O7'`` are dropped rather than
    reported, the same way ``nik`` validates its value.

    Returns:
        A dict with the keys ``'RT'`` and/or ``'RW'`` for the parts that
        are valid; empty when neither is.
    """
    result = {}
    # zip() stops at the shorter sequence, so a missing RW part is simply
    # skipped and anything after a second '/' is ignored.
    for key, part in zip(('RT', 'RW'), v.split('/')):
        number = clean(part, digits)
        # clean() only trims the edges; reject values with non-digits
        # left in the middle.
        if number and all(ch in digits for ch in number):
            result[key] = number
    return result
def tempat_tgl_lahir(v):
    """Split a place/date-of-birth value into its two parts.

    The last whitespace-separated word of *v* is tried as a ``DD-MM-YYYY``
    date; the words before it form the place of birth. A leading ``Lahir``
    (the tail of the ``Tempat/Tgl Lahir`` label) is removed from the place.

    Returns:
        A dict that may contain ``'Tgl_Lahir'`` (a ``datetime.date``) and
        ``'Tempat_Lahir'`` (a non-empty ``str``). Parts that are missing or
        invalid are left out; an empty *v* yields an empty dict.
    """
    d = {}
    words = v.split()
    if not words:
        # Label without any value: nothing to parse (avoids an IndexError
        # on words[-1]).
        return d
    tgl = is_date(words[-1])
    if tgl:
        d['Tgl_Lahir'] = tgl
    tempat = clean(' '.join(words[:-1]))
    # Remove the literal prefix only. str.lstrip('Lahir') would strip any
    # leading run of the characters L/a/h/i/r and turn e.g. 'LAMPUNG' into
    # 'AMPUNG'.
    if tempat.startswith('Lahir'):
        tempat = clean(tempat[len('Lahir'):])
    if tempat:
        d['Tempat_Lahir'] = tempat
    return d
def pekerjaan(v):
    """Return ``{'Pekerjaan': text}`` with *v* trimmed to start and end on a letter."""
    return {'Pekerjaan': clean(v)}
# Labels whose value needs a dedicated parser; parse() looks the label up
# here and merges the dict returned by the parser into its result.
PARSER = dict((
    ('RT/RW', rt_rw),
    ('Tempat/Tgl Lahir', tempat_tgl_lahir),
    ('NIK', nik),
    ('Pekerjaan', pekerjaan),
))
def parse(s):
    """Turn OCR text into a dict of recognised fields.

    Each line is trimmed to start and end with a letter or digit and split
    into words. The first word is matched against ``LABELS``; lines without
    a matching label are skipped. The remaining words are then handled in
    one of three ways:

    * label in ``VALUES``: the best matching reference value is stored,
      if there is one;
    * label in ``PARSER``: the parser's dict is merged into the result;
    * otherwise: the joined words, trimmed to start and end with an
      uppercase letter, are stored under the label.
    """
    result = {}
    for raw_line in s.splitlines():
        words = clean(raw_line, digits_letters).split()
        if not words:
            continue
        label = match_value(words[:1], LABELS)
        if not label:
            continue
        rest = words[1:]
        if label in VALUES:
            # Closed set of allowed values: keep only a recognised one.
            chosen = match_value(rest, VALUES[label])
            if chosen:
                result[label] = chosen
            continue
        text = ' '.join(rest)
        handler = PARSER.get(label)
        if handler:
            result.update(handler(text))
        else:
            result[label] = clean(text, ascii_uppercase)
    return result
def ocr(img, lang_dir='data'):
    """Read a KTP image and return its recognised fields as a dict.

    Args:
        img: Image passed unchanged to the package-level ``ocr``.
        lang_dir: Directory given to Tesseract as ``--tessdata-dir``.

    Returns:
        The dict produced by ``parse`` from the recognised text.
    """
    # The directory is wrapped in double quotes so that a path containing
    # spaces stays a single argument when the config string is split.
    tess_opt = '--psm 6 --tessdata-dir "{}"'.format(lang_dir)
    s = base_ocr(img, 'ind', tess_opt)
    return parse(s)
import os
import sys
import subprocess
from setuptools import (
setup,
find_packages,
)
# Directory that contains this setup script.
here = os.path.abspath(os.path.dirname(__file__))


def _read(filename):
    """Return the text of *filename* located next to this script."""
    with open(os.path.join(here, filename)) as f:
        return f.read()


README = _read('README.rst')
CHANGES = _read('CHANGES.txt')

# The version is the first word of the first line of CHANGES.txt.
line = CHANGES.splitlines()[0]
version = line.split()[0]

# Runtime dependencies.
requires = [
    'PyTesseract',
    'opencv-python',
    'jaccard-index',
]
# Package metadata; the long description is the README followed by the
# change log.
setup(
    name='ocr-ktp',
    version=version,
    description='Menerjemahkan foto KTP ke dictionary',
    long_description='\n\n'.join((README, CHANGES)),
    classifiers=['Programming Language :: Python :: 3.6'],
    keywords='ocr ktp',
    url='https://git.opensipkd.com/sugiana/ocr-ktp',
    author='Owo Sugiana',
    author_email='sugiana@gmail.com',
    packages=find_packages(),
    install_requires=requires,
    include_package_data=True,
    zip_safe=False,
)
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!