-
Notifications
You must be signed in to change notification settings - Fork 0
/
modifyTextsOdlomki.py
83 lines (72 loc) · 2.79 KB
/
modifyTextsOdlomki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import csv
import re
# Folder prefix that fileModifier prepends to "<shelf>/<name>.txt" when it
# opens a source text.
inMapa = "./textsOriginal/"
# Folder prefix under which fileModifier writes the excerpt files.
outMapa = "./textsModiedOdlomki/"
# Word-count threshold at which fileModifier stops extending an excerpt.
TOTAL_WORDS = 400
# Number of excerpts fileModifier writes for every source text.
STEVILO_ODLOMKOV = 20
# NOTE(review): not referenced anywhere in the visible code; presumably the
# intended order of the shelves (genres) - confirm before relying on it.
vrstniRed = ["Adventure", "Fantasy", "Science Fiction"]
# Running sequence number of the processed text. fileModifier uses it as the
# zero-padded prefix of the output file names and increments it after each
# call. (A `global` statement is only meaningful inside a function, so none
# is needed here at module level.)
zapStevilka = 1
def _find_text_bounds(lines):
    """Return ``(start, end)`` line indices between which excerpts are taken.

    The start is the index of the last line matching the "start of ...
    project gutenberg" marker (100 when no line matches), moved forward by
    100 lines and capped at 500. The end is the index of the first line
    matching the "end of ... project gutenberg" marker, or ``len(lines)``
    when there is none.
    """
    start_marker = re.compile(r".*start of .*project gutenberg.*")
    end_marker = re.compile(r".*end of .*project gutenberg.*")

    start_idx = 100
    end_idx = len(lines)
    end_found = False
    for idx, line in enumerate(lines):
        lowered = line.lower()
        # Every start-marker hit overwrites the previous one, whereas only
        # the first end-marker hit is kept.
        if start_marker.search(lowered):
            start_idx = idx
        if not end_found and end_marker.search(lowered):
            end_idx = idx
            end_found = True

    # NOTE(review): the extra 100 lines presumably skip front matter that
    # follows the marker (title page, contents) - confirm.
    start_idx += 100
    if start_idx > 500:
        print("PREVERI!!!!!!!!!")
        start_idx = 500
    return start_idx, end_idx


def _collect_excerpt(lines, first_line, min_words):
    """Join stripped lines from *first_line* on until *min_words* words are read.

    Every stripped line is followed by a single space. Collection stops at
    the end of *lines* even if fewer than *min_words* words were gathered,
    so a short text yields a short (possibly empty) excerpt instead of an
    IndexError.
    """
    pieces = []
    words = 0
    idx = first_line
    while words < min_words and idx < len(lines):
        line = lines[idx]
        # split() without a separator ignores blank lines and runs of
        # spaces; split(" ") would count each of them as a word.
        words += len(line.split())
        pieces.append(line.strip() + " ")
        idx += 1
    return "".join(pieces)


def fileModifier(fn, shelf):
    """Cut one source text into STEVILO_ODLOMKOV excerpts and write them out.

    Reads ``inMapa + shelf + "/" + fn + ".txt"`` (UTF-8), determines the
    usable line range with the Project Gutenberg start/end markers, and
    takes excerpts at evenly spaced line offsets inside that range. Each
    excerpt is extended line by line until it holds at least TOTAL_WORDS
    words or the file ends, and is written as a single line to
    ``outMapa + "<NNN><EE>.txt"``, where NNN is the zero-padded
    ``zapStevilka`` and EE is the 1-based excerpt number.

    Args:
        fn: File name of the text without the ".txt" extension.
        shelf: Name of the sub-folder of ``inMapa`` that holds the text.

    Side effects:
        Writes STEVILO_ODLOMKOV files, prints warnings to stdout and
        increments the module-level ``zapStevilka`` by one.
    """
    global zapStevilka

    inPath = inMapa + shelf + "/" + fn + ".txt"
    with open(inPath, 'r', encoding="utf8") as inputFile:
        lines = inputFile.readlines()

    zacIdx, konIdx = _find_text_bounds(lines)

    # Distance in lines between the starts of two consecutive excerpts.
    korakZaNaslednjiOdlomek = (konIdx - zacIdx) // STEVILO_ODLOMKOV
    # NOTE(review): 23 is presumably the fewest lines one excerpt needs, so
    # a smaller step would make neighbouring excerpts overlap - confirm.
    if korakZaNaslednjiOdlomek < 23:
        print("PREMAJHEN KORAK ", fn, korakZaNaslednjiOdlomek, zacIdx, konIdx, "!!!!!!!!!")

    for i in range(STEVILO_ODLOMKOV):
        outPath = outMapa + str(zapStevilka).zfill(3) + str(i + 1).zfill(2) + ".txt"
        odlomek = _collect_excerpt(
            lines, zacIdx + i * korakZaNaslednjiOdlomek, TOTAL_WORDS
        )
        with open(outPath, 'w', encoding="utf8") as outFile:
            outFile.write(odlomek + "\n")

    zapStevilka += 1
# Run fileModifier once for every book listed in textiFiction.csv.
with open('textiFiction.csv', newline='') as csvfile:
    texts = csv.reader(csvfile, delimiter=',', dialect="excel")
    # The first row is the header; the default keeps an empty file from
    # raising StopIteration.
    glava = next(texts, None)
    for book in texts:
        if not book:
            # csv.reader yields an empty list for a blank line (for example
            # a trailing newline at the end of the file).
            continue
        ime = book[1]    # second column is passed on as the file name
        shelf = book[4]  # fifth column is passed on as the shelf folder
        fileModifier(ime, shelf)