# ktp.py

import os
from datetime import datetime
from string import (
    ascii_letters,
    ascii_uppercase,
    digits,
    )
from cv2 import (
    imread,
    cvtColor,
    COLOR_BGR2GRAY,
    GaussianBlur,
    threshold,
    THRESH_BINARY,
    )
from pytesseract import image_to_string
from jaccard_index.jaccard import jaccard_index
from . import ocr as base_ocr


# Directory containing this module; ocr() resolves its data directory
# relative to it.
THIS_DIR = os.path.dirname(__file__)

# Field labels that parse() looks for in the first token of each OCR line.
LABELS = (
    'NIK',
    'Nama',
    'Tempat/Tgl Lahir',
    'Jenis',
    'Alamat',
    'RT/RW',
    'Kel/Desa',
    'Kecamatan',
    'Agama',
    'Status',
    'Pekerjaan',
    'Kewarganegaraan',
)

# Labels whose value is chosen from a fixed set of reference values
# instead of being taken verbatim from the OCR text.
VALUES = {
    'Jenis': ('LAKI-LAKI', 'PEREMPUAN'),
    'Agama': (
        'ISLAM',
        'KRISTEN',
        'KATOLIK',
        'BUDHA',
        'HINDU',
        'KONGHUCHU',
        'KEPERCAYAAN',
    ),
    'Status': ('KAWIN', 'BELUM'),
    'Kewarganegaraan': ('WNI', 'WNA'),
}

# NOTE(review): not referenced anywhere in the visible code - confirm usage.
NUMBERS = ('NIK', 'RT/RW')

# Characters allowed at either end of an OCR line once it has been trimmed.
digits_letters = digits + ascii_letters


def is_date(s):
    """Parse *s* as a ``DD-MM-YYYY`` date.

    Returns a ``datetime.date`` when *s* matches the format and names a
    valid calendar day, otherwise ``None``.
    """
    try:
        return datetime.strptime(s, '%d-%m-%Y').date()
    except ValueError:
        return None


def clean(s, chars=ascii_letters):
    """Trim *s* so that it starts and ends with a character from *chars*.

    Characters in the middle of the string are left untouched. An empty
    string is returned when *s* contains no character from *chars*.
    """
    # Position of the first acceptable character, if there is one.
    start = next((i for i, ch in enumerate(s) if ch in chars), None)
    if start is None:
        return ''
    # Walk back from the end; s[start] is acceptable, so this terminates.
    end = len(s)
    while s[end - 1] not in chars:
        end -= 1
    return s[start:end]


def match_value(values, ref_values):
    """Pick the entry of *ref_values* that best matches any of *values*.

    Candidates of two characters or fewer are ignored. Every remaining
    candidate is scored against every reference with ``jaccard_index``;
    scores below 0.3 are discarded. The reference with the highest score
    is returned, the earliest one winning a tie. Returns ``None`` when no
    pair reaches the threshold.
    """
    best_score = 0
    best_ref = None
    for candidate in values:
        if len(candidate) <= 2:
            continue
        for ref in ref_values:
            score = jaccard_index(candidate, ref)
            # Strictly greater, so the first best-scoring reference is kept.
            if score >= 0.3 and score > best_score:
                best_score = score
                best_ref = ref
    return best_ref


def nik(v):
    """Extract the NIK number from the raw OCR value *v*.

    Leading and trailing non-digit characters are trimmed. The result is
    accepted only when it is exactly 16 ASCII digits; then
    ``{'NIK': <digits>}`` is returned, otherwise an empty dict.
    """
    candidate = clean(v, digits)
    if len(candidate) == 16 and all(ch in digits for ch in candidate):
        return {'NIK': candidate}
    return {}


def rt_rw(v):
    """Split an ``RT/RW`` value into its two numeric parts.

    The text before the first ``/`` yields ``RT`` and the text between
    the first and second ``/`` yields ``RW``; anything further is
    ignored. Each part is trimmed to start and end with a digit, and a
    key is only present in the returned dict when its part is non-empty.
    """
    found = {}
    # zip() stops after two pairs, so extra '/'-separated parts are dropped.
    for key, raw in zip(('RT', 'RW'), v.split('/')):
        if not raw:
            continue
        number = clean(raw, digits)
        if number:
            found[key] = number
    return found


def tempat_tgl_lahir(v):
    """Split a place/date-of-birth value into its two fields.

    The last whitespace-separated token of *v* is tried as a
    ``DD-MM-YYYY`` date; every token before it forms the place name.

    Returns a dict that may contain:

    * ``Tgl_Lahir`` - a ``datetime.date``, when the last token is a date.
    * ``Tempat_Lahir`` - the place name, when one is left after trimming.

    An empty or whitespace-only *v* yields an empty dict.
    """
    d = dict()
    tokens = v.split()
    if not tokens:
        # Nothing followed the label on this line; previously this
        # raised IndexError on tokens[-1].
        return d
    tgl = is_date(tokens[-1])
    if tgl:
        d['Tgl_Lahir'] = tgl
    tempat = clean(' '.join(tokens[:-1]))
    # The label is matched on its first token only, so its second word
    # ('Lahir') usually leads the value. Drop it as a prefix: str.lstrip()
    # takes a set of characters and would also eat the start of the place
    # name itself (e.g. 'LAMPUNG' -> 'AMPUNG') when 'Lahir' is missing.
    if tempat.startswith('Lahir'):
        tempat = clean(tempat[len('Lahir'):])
    if tempat:
        d['Tempat_Lahir'] = tempat
    return d


def pekerjaan(v):
    """Return ``{'Pekerjaan': value}`` with *v* trimmed to start and end
    on an ASCII letter."""
    return {'Pekerjaan': clean(v)}


# Labels whose value needs dedicated parsing. Each handler receives the
# text that follows the label and returns a dict of fields, which parse()
# merges into its result.
PARSER = {
    'RT/RW': rt_rw,
    'Tempat/Tgl Lahir': tempat_tgl_lahir,
    'NIK': nik,
    'Pekerjaan': pekerjaan,
}


def parse(s):
    """Turn raw OCR text into a dict of recognised fields.

    Each line is trimmed to start and end with a letter or digit and
    split on whitespace. The first token is matched against ``LABELS``;
    lines without a matching label are skipped. The remaining tokens are
    handled in one of three ways:

    * labels listed in ``VALUES`` - the best-matching reference value is
      stored, or nothing when no token matches;
    * labels listed in ``PARSER`` - the dedicated handler's result is
      merged into the output;
    * any other label - the joined text, trimmed to start and end with an
      uppercase ASCII letter, is stored under the label.
    """
    fields = {}
    for raw_line in s.splitlines():
        tokens = clean(raw_line, digits_letters).split()
        if not tokens:
            continue
        label = match_value(tokens[:1], LABELS)
        if not label:
            continue
        rest = tokens[1:]

        if label in VALUES:
            choice = match_value(rest, VALUES[label])
            if choice:
                fields[label] = choice
            continue

        text = ' '.join(rest)
        handler = PARSER.get(label)
        if handler:
            fields.update(handler(text))
        else:
            fields[label] = clean(text, ascii_uppercase)
    return fields


def ocr(img, lang_dir='data'):
    """Run OCR on *img* and parse the recognised text into a field dict.

    *lang_dir* is resolved relative to this module's directory and passed
    as ``--tessdata-dir`` alongside ``--psm 6``. The language code given
    to the underlying OCR call is ``'ind'``.
    """
    tessdata_dir = os.path.join(THIS_DIR, lang_dir)
    # NOTE(review): base_ocr comes from ``from . import ocr`` - presumably a
    # callable taking (image, language, options); confirm in the package.
    options = '--psm 6 --tessdata-dir ' + tessdata_dir
    return parse(base_ocr(img, 'ind', options))