ktp.py
3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
from datetime import datetime
from string import (
ascii_letters,
ascii_uppercase,
digits,
)
from cv2 import (
imread,
cvtColor,
COLOR_BGR2GRAY,
GaussianBlur,
threshold,
THRESH_BINARY,
)
from pytesseract import image_to_string
from jaccard_index.jaccard import jaccard_index
from . import ocr as base_ocr
# Directory holding this module; ocr() resolves its ``lang_dir`` against it.
THIS_DIR = os.path.dirname(__file__)

# Field labels that parse() looks for in the first word of every text line.
LABELS = (
    'NIK',
    'Nama',
    'Tempat/Tgl Lahir',
    'Jenis',
    'Alamat',
    'RT/RW',
    'Kel/Desa',
    'Kecamatan',
    'Agama',
    'Status',
    'Pekerjaan',
    'Kewarganegaraan',
)

# Labels whose value must be one of a closed set of reference words.
VALUES = {
    'Jenis': ('LAKI-LAKI', 'PEREMPUAN'),
    'Agama': (
        'ISLAM',
        'KRISTEN',
        'KATOLIK',
        'BUDHA',
        'HINDU',
        'KONGHUCHU',
        'KEPERCAYAAN',
    ),
    'Status': ('KAWIN', 'BELUM'),
    'Kewarganegaraan': ('WNI', 'WNA'),
}

# Labels whose value is numeric.
NUMBERS = ('NIK', 'RT/RW')

# Characters accepted at either end of a raw text line (see parse()).
digits_letters = digits + ascii_letters
def is_date(s):
    """Parse *s* as a ``DD-MM-YYYY`` date.

    Returns the corresponding ``datetime.date``, or ``None`` when *s* does
    not match that format or is not a valid calendar date.
    """
    try:
        return datetime.strptime(s, '%d-%m-%Y').date()
    except ValueError:
        return None
def clean(s, chars=ascii_letters):
    """Trim *s* so that it starts and ends with a character from *chars*.

    Leading and trailing characters that are not in *chars* are dropped;
    everything between the first and last accepted character is kept
    unchanged, including characters that are not in *chars*.

    Args:
        s: The string to trim.
        chars: Container of accepted boundary characters (ASCII letters by
            default).

    Returns:
        The trimmed string, or ``''`` when *s* holds no character from
        *chars*.
    """
    start = next((i for i, ch in enumerate(s) if ch in chars), None)
    if start is None:
        return ''
    # s[start] is accepted, so this backward scan always stops at or after
    # ``start``.  Moving an index avoids re-slicing the string on each step.
    end = len(s)
    while s[end - 1] not in chars:
        end -= 1
    return s[start:end]
def match_value(values, ref_values):
    """Pick the reference value most similar to any of *values*.

    Every candidate of at least three characters is scored against every
    entry of *ref_values* with ``jaccard_index``.  Scores below 0.3 are
    ignored.  On a tie the earliest pairing wins.

    Returns:
        The best-scoring entry of *ref_values*, or ``None`` when nothing
        reaches the threshold.
    """
    best_score = 0
    best_ref = None
    for candidate in values:
        # Candidates shorter than three characters are never compared.
        if len(candidate) < 3:
            continue
        for ref in ref_values:
            score = jaccard_index(candidate, ref)
            if score >= 0.3 and score > best_score:
                best_score = score
                best_ref = ref
    return best_ref
def nik(v):
    """Extract the NIK number from the text *v*.

    The text is trimmed to its first and last digit; the result is accepted
    only if it is exactly 16 characters long and made of digits only.

    Returns:
        ``{'NIK': number}`` on success, otherwise an empty dict.
    """
    candidate = clean(v, digits)
    if len(candidate) == 16 and all(ch in digits for ch in candidate):
        return {'NIK': candidate}
    return {}
def rt_rw(v):
    """Split an ``RT/RW`` value into its two parts.

    *v* is split on ``/``; the first piece feeds ``RT`` and the second, if
    present, feeds ``RW``.  Each piece is trimmed to its first and last
    digit, and a key is emitted only when something remains.

    Returns:
        A dict with zero, one or both of the keys ``'RT'`` and ``'RW'``.
    """
    result = {}
    # zip() stops at the shorter side, so a missing second piece is skipped
    # and any pieces after the second are ignored.
    for key, piece in zip(('RT', 'RW'), v.split('/')):
        if not piece:
            continue
        number = clean(piece, digits)
        if number:
            result[key] = number
    return result
def tempat_tgl_lahir(v):
    """Split a ``Tempat/Tgl Lahir`` value into birth place and birth date.

    The last whitespace-separated token is tried as a ``DD-MM-YYYY`` date;
    the tokens before it form the place.  The last token is left out of the
    place even when it is not a valid date.

    Args:
        v: Text that followed the first word of the label.

    Returns:
        A dict that may contain ``'Tgl_Lahir'`` (a ``datetime.date``) and
        ``'Tempat_Lahir'`` (a non-empty string).  Empty when *v* holds no
        tokens.
    """
    d = dict()
    t = v.split()
    if not t:
        # Nothing followed the label; t[-1] below would raise IndexError.
        return d
    tgl = is_date(t[-1])
    if tgl:
        d['Tgl_Lahir'] = tgl
    tempat = clean(' '.join(t[:-1]))
    # parse() matches the label on its first word only, so the label's
    # second word ("Lahir") is normally still at the front of the value.
    # Drop it as a whole prefix: str.lstrip('Lahir') strips a character
    # *set* and would also eat leading letters of the place name itself.
    prefix = 'Lahir'
    if tempat.startswith(prefix):
        tempat = clean(tempat[len(prefix):])
    if tempat:
        d['Tempat_Lahir'] = tempat
    return d
def pekerjaan(v):
    """Return ``{'Pekerjaan': text}`` with *v* trimmed to its first and
    last ASCII letter."""
    return {'Pekerjaan': clean(v)}
# Label -> dedicated parser.  Each parser receives the text that followed
# the label and returns a dict of fields that parse() merges into its
# result.  Labels without an entry here get a generic clean-up in parse().
PARSER = dict((
    ('RT/RW', rt_rw),
    ('Tempat/Tgl Lahir', tempat_tgl_lahir),
    ('NIK', nik),
    ('Pekerjaan', pekerjaan),
))
def parse(s):
    """Turn multi-line text into a dict of recognised fields.

    Each line is trimmed to its first and last letter or digit and split on
    whitespace.  The first token is fuzzy-matched against ``LABELS``; lines
    with no matching label are skipped.  The remaining tokens are handled
    in one of three ways:

    * label in ``VALUES``: fuzzy-matched against the allowed values and
      stored only when one matches;
    * label in ``PARSER``: joined with spaces and passed to the dedicated
      parser, whose dict is merged into the result;
    * otherwise: joined with spaces, trimmed to the first and last
      uppercase ASCII letter, and stored under the label.

    Returns:
        A dict of the extracted fields.
    """
    result = {}
    for raw_line in s.splitlines():
        tokens = clean(raw_line, digits_letters).split()
        if not tokens:
            continue
        label = match_value(tokens[:1], LABELS)
        if not label:
            continue
        rest = tokens[1:]

        if label in VALUES:
            choice = match_value(rest, VALUES[label])
            if choice:
                result[label] = choice
            continue

        text = ' '.join(rest)
        handler = PARSER.get(label)
        if handler:
            result.update(handler(text))
        else:
            result[label] = clean(text, ascii_uppercase)
    return result
def ocr(img, lang_dir='data'):
    """Run OCR on *img* and return the parsed fields.

    Args:
        img: Image input, forwarded unchanged to ``base_ocr``.
        lang_dir: Directory, relative to this module's directory, passed as
            ``--tessdata-dir``.

    Returns:
        The dict produced by ``parse`` from the text ``base_ocr`` returns.
    """
    tessdata_path = os.path.join(THIS_DIR, lang_dir)
    options = '--psm 6 --tessdata-dir ' + tessdata_path
    # NOTE(review): the path is appended unquoted; presumably base_ocr
    # hands this string to Tesseract as-is, so a path containing spaces
    # may be split into separate arguments. Confirm against base_ocr.
    return parse(base_ocr(img, 'ind', options))