forked from 0dust/ResumeFilter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdl_parser.py
151 lines (131 loc) · 6.5 KB
/
dl_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 26 16:04:23 2018
@author: himanshu
"""
import os
import sys
import pandas as pd
from classifier.lstm import BidirectionalLstm
from utils.model_essentials import essentials
from utils.load_training_data import load_final_data
from utils.parsing_rules import *
from keras.preprocessing.text import text_to_word_sequence
# Index -> name lookup tables for the two per-line classification tasks.
# NOTE(review): presumably these mirror the class indices emitted by the
# classifiers; nothing in this module's visible code reads them -- confirm
# against classifier/lstm.py before relying on the ordering.
line_labels = dict(enumerate(('experience', 'knowledge', 'education', 'project', 'others')))
line_types = dict(enumerate(('header', 'meta', 'content')))
class ResumeParser():
    """Line-by-line resume parser backed by two ``BidirectionalLstm`` models.

    One classifier predicts a *label* for each line and the other predicts a
    *type*; the models are stored under the ``label`` and ``type``
    sub-directories of the model directory.  ``parse`` sorts the input lines
    into the instance's buckets (``education``, ``experience``, ``project``,
    ``skills``, ``meta``, ``header``) and records the last e-mail / gender
    found.  ``return_parsed_resume`` exports the non-empty fields as a dict.

    Results accumulate on the instance: calling ``parse`` more than once
    appends to the same lists.
    """

    # Attributes exported by return_parsed_resume(), in output order.
    _RESULT_FIELDS = ('email', 'gender', 'education', 'experience',
                      'project', 'skills', 'meta', 'header')

    def __init__(self):
        self.raw_text = None  # set by parse(); defined here so it always exists
        self.email = None
        self.gender = None
        self.education = []
        self.experience = []
        self.project = []
        self.meta = []
        self.header = []
        self.skills = []
        self.line_label_classifier = BidirectionalLstm()
        self.line_type_classifier = BidirectionalLstm()

    def load_model(self, model_dir_path):
        """Load both classifiers from ``<model_dir_path>/label`` and ``<model_dir_path>/type``."""
        self.line_label_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'label'))
        self.line_type_classifier.load_model(
            model_dir_path=os.path.join(model_dir_path, 'type'))

    def fit(self, training_data_dir_path, model_dir_path, batch_size=128, epochs=10,
            train_test_split_ratio=0.3, random_state=2018, dropout_rate=None,
            use_pretrained_embedd=False, embedding_size=None):
        """Train the line-label classifier, then the line-type classifier.

        All hyper-parameters are forwarded unchanged to both trainings.

        Returns:
            A two-element list ``[label_result, type_result]`` holding
            whatever each classifier's ``fit`` returned.
        """
        line_label_fitted_model = self.fit_line_label_classifier(
            training_data_dir_path, model_dir_path, batch_size, epochs,
            train_test_split_ratio, random_state, dropout_rate,
            use_pretrained_embedd, embedding_size)
        line_type_fitted_model = self.fit_line_type_classifier(
            training_data_dir_path, model_dir_path, batch_size, epochs,
            train_test_split_ratio, random_state, dropout_rate,
            use_pretrained_embedd, embedding_size)
        return [line_label_fitted_model, line_type_fitted_model]

    def fit_line_label_classifier(self, training_data_dir_path, model_dir_path, batch_size,
                                  epochs, train_test_split_ratio, random_state, dropout_rate,
                                  use_pretrained_embedd, embedding_size):
        """Train the line-label classifier using the ``'label'`` column.

        The model directory passed to the classifier is
        ``<model_dir_path>/label`` (the same location ``load_model`` reads).

        Returns:
            Whatever ``self.line_label_classifier.fit`` returns.
        """
        model_essentials_dict = essentials(training_data_dir_path, label_column_name='label')
        final_training_data = load_final_data(training_data_dir_path, label_column_name='label')
        print('###################### Training for line label ###########################')
        # os.path.join keeps the save path consistent with load_model().
        return self.line_label_classifier.fit(
            os.path.join(model_dir_path, 'label'), model_essentials_dict,
            final_training_data, batch_size, epochs,
            train_test_split_ratio, random_state, dropout_rate,
            use_pretrained_embedd, embedding_size)

    def fit_line_type_classifier(self, training_data_dir_path, model_dir_path, batch_size,
                                 epochs, train_test_split_ratio, random_state, dropout_rate,
                                 use_pretrained_embedd, embedding_size):
        """Train the line-type classifier using the ``'type'`` column.

        The model directory passed to the classifier is
        ``<model_dir_path>/type`` (the same location ``load_model`` reads).

        Returns:
            Whatever ``self.line_type_classifier.fit`` returns.
        """
        model_essentials_dict = essentials(training_data_dir_path, label_column_name='type')
        final_training_data = load_final_data(training_data_dir_path, label_column_name='type')
        print('################### Training for label type #############################')
        # os.path.join keeps the save path consistent with load_model().
        return self.line_type_classifier.fit(
            os.path.join(model_dir_path, 'type'), model_essentials_dict,
            final_training_data, batch_size, epochs,
            train_test_split_ratio, random_state, dropout_rate,
            use_pretrained_embedd, embedding_size)

    @staticmethod
    def get_education(line_label, line):
        """Return ``line`` if ``line_label`` is ``'education'``, else ``None``."""
        return line if line_label == 'education' else None

    @staticmethod
    def get_experience(line_label, line):
        """Return ``line`` if ``line_label`` is ``'experience'``, else ``None``."""
        return line if line_label == 'experience' else None

    @staticmethod
    def get_project(line_label, line):
        """Return ``line`` if ``line_label`` is ``'project'``, else ``None``."""
        return line if line_label == 'project' else None

    @staticmethod
    def get_skills(line_label, line):
        """Return ``line`` if ``line_label`` is ``'skill'``, else ``None``."""
        # NOTE(review): the module-level ``line_labels`` table lists
        # 'knowledge' and has no 'skill' entry; if the classifier emits names
        # from that table this branch never matches -- confirm the label
        # vocabulary before changing the comparison.
        return line if line_label == 'skill' else None

    def parse(self, text):
        """Classify every item of ``text`` and file it into the result buckets.

        Args:
            text: iterable of resume lines.  It is iterated directly, so a
                single string would be processed character by character.

        The last non-``None`` result of ``get_email`` / ``get_gender`` wins.
        Lines are appended to the bucket matching their predicted label and,
        independently, to ``meta`` or ``header`` when the predicted type says
        so.  Assumes ``predict_class`` returns the class name as a string --
        TODO confirm against ``BidirectionalLstm``.
        """
        self.raw_text = text
        for line in text:
            line_label = self.line_label_classifier.predict_class(line)
            line_type = self.line_type_classifier.predict_class(line)

            email = get_email(line)
            gender = get_gender(line)
            if email is not None:
                self.email = email
            if gender is not None:
                self.gender = gender

            # Each helper yields the line when the label matches, else None.
            extracted = (
                (self.get_education(line_label, line), self.education),
                (self.get_experience(line_label, line), self.experience),
                (self.get_project(line_label, line), self.project),
                (self.get_skills(line_label, line), self.skills),
            )
            for value, bucket in extracted:
                if value is not None:
                    bucket.append(value)

            if line_type == 'meta':
                self.meta.append(line)
            if line_type == 'header':
                self.header.append(line)

    def return_parsed_resume(self):
        """Return a dict of every non-empty parsed field.

        Keys (when present) appear in the order ``email``, ``gender``,
        ``education``, ``experience``, ``project``, ``skills``, ``meta``,
        ``header``.  Falsy fields (``None`` or empty list) are omitted.
        """
        result = dict()
        for field in self._RESULT_FIELDS:
            value = getattr(self, field)
            if value:
                # 'email' is read from self.email; the attribute 'name' that
                # was referenced here before is never defined on the instance.
                result[field] = value
        return result