handler.py
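"""
AWS Lambda handler triggered by an S3 upload event: it downloads the uploaded
resume PDF, extracts text blocks and font sizes with pdfminer, parses out
fields such as name, contact details, skills, experience, and education,
writes the result back to S3 as JSON, and pushes the processed file name to
an SQS queue for downstream consumers.
"""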
import os
import re
import json

import boto3
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

# Initialize the S3 connection with a boto3 session built from environment variables.
AWS_ACCESS_KEY_ID = os.environ['aws_access_key_id']
AWS_SECRET_ACCESS_KEY = os.environ['aws_secret_access_key']
AWS_REGION = os.environ['region_name']
SQS_URL = os.getenv('SQS_URL')

session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)

# Create an S3 client from the session; the SQS client falls back to the default credential chain.
client = session.client('s3')
SQS_CLIENT = boto3.client('sqs')

# Extracted text blocks and the font size of each block's first character.
# The two lists stay index-aligned so a heading's size can be looked up by position.
text_list = []
text_size = []
def find_between_r(s, first, last):
    """Return the substring of s between the last occurrences of first and last."""
    try:
        start = s.rindex(first) + len(first)
        end = s.rindex(last, start)
        return s[start:end]
    except ValueError:
        return ""
def containsLetterAndNumber(input):
    # Note: despite the name, this returns True when the text contains at least
    # one letter OR one digit; it is used to skip whitespace/punctuation-only blocks.
    has_letter = False
    has_number = False
    for x in input:
        if x.isalpha():
            has_letter = True
        elif x.isnumeric():
            has_number = True
    if has_letter or has_number:
        return True
    return False
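# e.g. containsLetterAndNumber("Page 3\n") -> True, containsLetterAndNumber("   \n") -> False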
def json_resume(event, context):
    s3 = boto3.client("s3")
    bucket_name = 'my-pdf-upload-bucket'
    # Lambda containers can be reused, so clear the module-level buffers
    # before each invocation.
    text_list.clear()
    text_size.clear()
    if event:
        file_obj = event["Records"][0]
        filename = str(file_obj["s3"]["object"]["key"])
        if filename.split('.')[-1] == 'pdf':
            tmp_filename = '/tmp/' + str(filename)
            s3.download_file(bucket_name, filename, tmp_filename)
            # Walk the PDF layout: keep every text block that has real content,
            # together with the font size of its first character, so that
            # text_list and text_size stay index-aligned.
            for page_layout in extract_pages(tmp_filename):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        temp_text = element.get_text()
                        if containsLetterAndNumber(temp_text):
                            text_list.append(element.get_text())
                            for text_line in element:
                                for character in text_line:
                                    if isinstance(character, LTChar):
                                        text_size.append(character.size)
                                        break
                                break
            name = ''
            skills = []
            contact_number = 0
            linkedin_url = ''
            job_role = ''
            current_location = ''
            email = ''
            summary = ''
            experience = []
            education = []
            for idx, val in enumerate(text_size):
                # The candidate's name is rendered in the largest font (size 26);
                # the block right after it holds the job role and current location.
                if int(val) == 26:
                    name = text_list[idx].replace("\n", "")
                    job_role = text_list[idx + 1].split("\n")[0]
                    current_location = text_list[idx + 1].split("\n")[1]
                # Collect every block of the Summary section: they share the font
                # size of the block that follows the heading.
                if text_list[idx] == 'Summary\n':
                    count = idx + 1
                    while count < len(text_list) and text_size[idx + 1] == text_size[count]:
                        summary = summary + text_list[count].replace("\n", '')
                        count = count + 1
                # Experience entries sit under the 'Experience' heading (size 15.75):
                # each entry starts with a title line followed by detail lines
                # rendered in a font smaller than 11pt.
                if text_list[idx] == 'Experience\n' and text_size[idx] == 15.75:
                    count = idx + 1
                    while count < len(text_list) - 1 and text_size[idx] > text_size[count]:
                        temp_exp = text_list[count]
                        count = count + 1
                        while count < len(text_list) and text_size[count] < 11:
                            temp_exp = temp_exp + text_list[count]
                            count = count + 1
                        experience.append(temp_exp)
                # Education entries follow the same pattern under the 'Education' heading.
                if text_list[idx] == 'Education\n' and text_size[idx] == 15.75:
                    count = idx + 1
                    while count < len(text_list) - 1 and text_size[idx] > text_size[count]:
                        temp_edu = text_list[count]
                        count = count + 1
                        while count < len(text_list) - 1 and text_size[count] < 11:
                            temp_edu = temp_edu + text_list[count]
                            count = count + 1
                        education.append(temp_edu)
                # The block after the 'Contact' heading holds the phone number and email.
                if text_list[idx] == 'Contact\n':
                    contact_number = re.sub(r'\D', '', text_list[idx + 1])
                    email = text_list[idx + 1].split('\n')[1]
                # The three blocks after 'Top Skills' list the candidate's skills.
                if text_list[idx] == 'Top Skills\n':
                    skills.append(text_list[idx + 1].replace("\n", ""))
                    skills.append(text_list[idx + 2].replace("\n", ""))
                    skills.append(text_list[idx + 3].replace("\n", ""))
                # Extract the LinkedIn URL of the profile.
                if text_list[idx].find("linkedin.com") != -1:
                    unprocessed_linkedin = find_between_r(text_list[idx], "www.linkedin.com", "(LinkedIn)")
                    linkedin_url = 'https://www.linkedin.com' + unprocessed_linkedin.replace("\n", "")
            # Assemble the parsed resume fields into a single JSON-serializable dict.
            data = {
                "linkedin_url": linkedin_url,
                "name": name,
                "email": email,
                "contact_number": contact_number,
                "current_location": current_location,
                "job_role": job_role,
                "summary": summary,
                "skills": skills,
                "experience": experience,
                "education": education
            }
            data = json.dumps(data).encode('UTF-8')
            json_file = filename.split('.')[-2]
            json_file = json_file + '.json'
            # Upload the parsed resume as <original name>.json to the same bucket.
            result = client.put_object(ACL='public-read', Body=data, Bucket=bucket_name, Key=json_file)
            s3_res = result.get('ResponseMetadata')
            # The prints below end up in the CloudWatch logs.
            if s3_res.get('HTTPStatusCode') == 200:
                print('File Uploaded Successfully')
                # Notify the downstream consumer via SQS which file was processed.
                response = SQS_CLIENT.send_message(
                    QueueUrl=SQS_URL,
                    MessageBody=filename
                )
                sqs_res = response.get('ResponseMetadata')
                if sqs_res.get('HTTPStatusCode') == 200:
                    print('Message Sent to SQS Queue', filename)
                else:
                    print('Failed to send Message to SQS')
            else:
                print('File Not Uploaded')
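
# Minimal local-test sketch, not part of the Lambda deployment: it builds a
# hypothetical S3 put-event record and calls the handler directly. The key
# 'sample-resume.pdf' is an assumption for illustration; the object must
# already exist in the configured bucket for the download step to succeed.
if __name__ == "__main__":
    sample_event = {
        "Records": [
            {"s3": {"object": {"key": "sample-resume.pdf"}}}
        ]
    }
    json_resume(sample_event, None)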