-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_reader.py
53 lines (47 loc) · 1.38 KB
/
file_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 31 20:40:50 2016
"""
import re
import xml.etree.ElementTree as ET
import os
def read_xml_file(path_to_file):
    """Return the text of the ``<TEXT>`` element of an XML-like document.

    The first line of the file is skipped before parsing; the remainder is
    parsed as XML.  If we ever want to use that first line for ranking
    answers, it could be captured here and returned along with the sentence.

    Args:
        path_to_file: Path of the document to read.

    Returns:
        The ``text`` of the last ``<TEXT>`` element found in the document
        (``None`` if that element has no text).  If the remainder of the
        file cannot be parsed, or contains no ``<TEXT>`` element, the path
        is printed and an empty string is returned.
    """
    # "with" guarantees the handle is closed, including when parsing fails.
    with open(path_to_file, "rb") as read_handle:
        # Get rid of the first line before parsing as XML.
        read_handle.readline()
        try:
            root = ET.parse(read_handle).getroot()
        except Exception:
            # Best effort: report the offending file and carry on.  Unlike a
            # bare "except", this lets KeyboardInterrupt/SystemExit through.
            print(path_to_file)
            return ''

    text_elements = list(root.iter('TEXT'))
    if not text_elements:
        # Same outcome as a parse failure: report the file, return no text.
        print(path_to_file)
        return ''
    return text_elements[-1].text
def read_file_text(path_to_file):
    """Read a document and return its first non-blank line plus its text.

    Leading blank (whitespace-only) lines are skipped.  The first line with
    any non-whitespace content is returned as ``score``; everything after
    it is read as the body, with every ``<...>`` tag removed.

    Args:
        path_to_file: Path of the document to read.

    Returns:
        A ``(score, text)`` tuple.  ``score`` is the raw line, including
        its trailing newline if present; ``text`` is the rest of the file
        with tags stripped.  Both are empty strings when the file has no
        non-blank line.
    """
    # "with" closes the handle even if reading raises.
    with open(path_to_file, "r") as read_handle:
        score = read_handle.readline()
        # readline() returns '' only at end of file; stop there so an empty
        # or all-blank file cannot loop forever.
        while score and not score.split():
            score = read_handle.readline()
        # NOTE: to drop everything up to <TEXT> in the answer document,
        # keep reading lines here until one contains "<TEXT>".
        text = read_handle.read()
    return (score, re.sub(r'<[^>]*>', '', text))
def list_all_files(path):
    """Return the all-digit entry names in ``path`` as integers.

    Every name reported by ``os.listdir(path)`` that consists solely of
    digits is converted with ``int``; all other entries are ignored.  The
    result follows the order in which ``os.listdir`` yields the names.
    """
    return [int(entry) for entry in os.listdir(path) if entry.isdigit()]