-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpptxReader.py
49 lines (39 loc) · 1.6 KB
/
pptxReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import re
from glob import iglob
from zipfile import ZipFile

from bs4 import BeautifulSoup
class PowerPointReader:
    """Locate .pptx files and list the slide parts stored inside one.

    A .pptx file is opened as a zip archive by ``build``; ``_getSlides``
    then returns the archive members under ``ppt/slides/`` ordered by the
    number embedded in their names. Instances can be used as context
    managers so the archive is closed on exit.
    """

    # regex to extract the slide number (digits directly before ".xml")
    _regex = re.compile(r".*?(\d+)\.xml")

    def __init__(self):
        # Handle of the currently open archive; set by build(), reset by close().
        self.f = None

    @staticmethod
    def _sorter(x):
        """Return the number embedded in archive member name *x* as an int.

        Raises:
            AttributeError: if *x* contains no digits followed by ``.xml``.
        """
        # Refer to this class directly; the previous code named a
        # non-existent ``PowerPointExtractor`` and failed with NameError.
        return int(PowerPointReader._regex.match(x).group(1))

    def findall(self, fp):
        """Lazily yield the paths of all .pptx files below directory *fp*.

        Args:
            fp: Directory to search. A trailing separator is optional; an
                empty string searches relative to the current directory.

        Returns:
            An iterator over the matching paths, at any depth under *fp*.
        """
        # "**" only descends into subdirectories when recursive=True.
        return iglob(os.path.join(fp, "**", "*.pptx"), recursive=True)

    def build(self, target):
        """Open *target* (a path or file-like object) as a zip archive.

        An archive left open by an earlier call is closed first so that
        repeated calls do not leak file handles.
        """
        self.close()
        self.f = ZipFile(target)

    def close(self):
        """Close the currently open archive, if any."""
        # getattr keeps this safe on instances created without __init__.
        archive = getattr(self, "f", None)
        if archive is not None:
            archive.close()
            self.f = None

    def _getSlides(self):
        """Return the slide XML member names sorted by slide number.

        Only members whose names start with ``ppt/slides/`` and end with
        ``.xml`` are considered. ``build`` must have been called first.
        """
        slides = [
            name
            for name in self.f.namelist()
            if name.startswith("ppt/slides/") and name.endswith(".xml")
        ]
        # Numeric key so that e.g. slide10 sorts after slide2.
        return sorted(slides, key=self._sorter)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
# def readPptx(fp):
# with ZipFile(fp) as f:
# slides = list(filter(lambda x: x.startswith("ppt/slides/") and x.endswith(".xml"), f.namelist()))
# slides = sorted(slides, key=PowerPointExtractor._sorter)
# print(slides)
# with f.open(slides[5]) as f:
# bs = BeautifulSoup(f.read(), "lxml")
# print(bs.prettify())
# textBody = list(filter(lambda x: x.name == "p:txbody", bs.recursiveChildGenerator()))
# text = []
# for node in textBody:
# text.append(list(filter(lambda x: x.name == "a:t", node.recursiveChildGenerator())))
# temp = []
# for i in text:
# temp.extend(map(lambda x: x.text, i))
# return "".join(temp)
# Paths of every .pptx file found at any depth below the current working
# directory, collected eagerly into a list at import time.
targets = list(iglob("**/*.pptx", recursive=True))

# Starts empty; nothing in this part of the file fills it.
powerpoints = []