Skip to content

Commit

Permalink
Support RTF and add unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
louisjdmartin committed Jul 4, 2019
1 parent 1130cef commit d2ad9c9
Show file tree
Hide file tree
Showing 12 changed files with 173 additions and 49 deletions.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Poly14/
Poly18/
.idea
__pycache__
venv
TESTCSV/
TESTXLSX.xlsx
export/
export_csv/
export_html/
9 changes: 9 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
language: python
python:
- "3.7"

install:
- pip install -r requirements.txt

script:
- pytest
94 changes: 48 additions & 46 deletions HTMLReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,52 +29,53 @@

# __________________ Definition de classes ___________________
class HTMLReader(object):

""" Nom de la classe: HTMLReader
Description: Cette classe lit un fichier HTML et permet
l'extraction des tableaux qu'il contient """

def __init__(self, filename, version=18):
self.__filename = filename
self.__file = open(filename, "r")
self._file = open(filename, "r")
self.__version = version
self.__lines = None
self._lines = None
# Structure de cette variable definie dans la doctstring de get_all_tables
self.__tables = dict()
print("Reading report for polyspace 20" + str(version))

def __read_lines(self):
def _read_lines(self):
""" Converti un fichier HTML en MarkDown pour le rendre plus lisible """

# Prise en compte des specificites de chaque version de polyspace
# Une premiere conversion en Markdown est faite, le caractere | peux poser quelques soucis dans l'interpretation
if self.__version == 18:
self.__lines = "".join(self.__file.readlines()) \
self._lines = "".join(self._file.readlines()) \
.replace(PIPE, PIPE_ESCAPED) \
.replace("<td><p>", "<td>").replace("</p></td>", "</td>")
else:
self.__lines = "".join(self.__file.readlines()) \
.replace("|", "PIPE")
self._lines = "".join(self._file.readlines()) \
.replace(PIPE, PIPE_ESCAPED)

read_html = HTML2Text()
read_html.ignore_links = True
read_html.ignore_tables = False
read_html.pad_tables = True if self.__version == 18 else False
# Evite les retour a la ligne trop frequent
read_html.body_width = 100000
self.__lines = read_html.handle(self.__lines).split("\n")
self._lines = read_html.handle(self._lines).split("\n")

def __detect_begin_table(self, line):
def _detect_begin_table(self, line):
""" Verifie dans le markdown si on trouve le debut d'un tableau """
line += 1
# Recherche ligne vide suivie d'une ligne non vide
while line + 1 < len(self.__lines) \
and self.__lines[line].strip() \
and not self.__lines[line + 1].strip():
while line + 1 < len(self._lines) \
and self._lines[line].strip() \
and not self._lines[line + 1].strip():
line += 1

# On considere un debut de tableau si on trouve les headers de celui-ci
is_begin_table = bool(line + 2 < len(self.__lines)
and SEPARATOR_HEADER_LINE in self.__lines[line + 2])
is_begin_table = bool(line + 2 < len(self._lines)
and SEPARATOR_HEADER_LINE in self._lines[line + 2])
return is_begin_table

@staticmethod
Expand All @@ -92,48 +93,48 @@ def __remove_subcell(append_to_table):

def __is_not_begin_chapter(self, line):
# IMPROVEMENT: Une expression reguliere pourrait ameliorer la precision de la recherche
return CHAR_BEGIN_CHAPTER not in self.__lines[line]
return CHAR_BEGIN_CHAPTER not in self._lines[line]

def __find_chapter(self, line):
while line < len(self.__lines) and self.__is_not_begin_chapter(line):
while line < len(self._lines) and self.__is_not_begin_chapter(line):
line += 1
return line

def __next_chapter_or_table(self, line, current_section):
while line < len(self.__lines) \
and not self.__detect_begin_table(line) \
while line < len(self._lines) \
and not self._detect_begin_table(line) \
and self.__is_not_begin_chapter(line):
if self.__lines[line].startswith(CHAR_SECTION):
current_section = self.__lines[line][len(CHAR_SECTION):]
if self._lines[line].startswith(CHAR_SECTION):
current_section = self._lines[line][len(CHAR_SECTION):]
line += 1
return line, current_section

def __read_row(self, line, number_of_cols):
append_to_table = self.__lines[line]
append_to_table = self._lines[line]

# On peux avoir des retours a la ligne dans une cellule, d'ou la boucle
while line + 1 < len(self.__lines) and len(
while line + 1 < len(self._lines) and len(
self.__split_line(append_to_table)) < number_of_cols:

line += 1
append_to_table += "\n" + self.__lines[line]
append_to_table += "\n" + self._lines[line]

# En terme de syntaxe HTML on a parfois deux celules en une, dans le rendu
# cela ne se traduit par un simple retour a la ligne
if SEPARATOR_SUB_CELL in append_to_table:
append_to_table = self.__remove_subcell(append_to_table)
self.__lines[line + 1] += "|"
self._lines[line + 1] += "|"

# Sous cellule trouve sur la derniere cellule de la ligne d'un tableau
if self.__lines[line + 1].strip() == SEPARATOR_SUB_CELL:
if self._lines[line + 1].strip() == SEPARATOR_SUB_CELL:
# Dans ce cas, tant qu'on trouve pas la ligne suivante d'un tableau on considere
# le contenu comme etant de la cellule (il peux y avoir des retours a la ligne
# dans cette cellule)
while line + 1 < len(self.__lines) \
and self.__lines[line + 1].strip() \
and SEPARATOR_CELL not in self.__lines[line + 1]:
while line + 1 < len(self._lines) \
and self._lines[line + 1].strip() \
and SEPARATOR_CELL not in self._lines[line + 1]:
line += 1
append_to_table += "\n" + self.__lines[line]
append_to_table += "\n" + self._lines[line]
if SEPARATOR_SUB_CELL in append_to_table:
append_to_table = self.__remove_subcell(append_to_table)
return line, append_to_table
Expand All @@ -146,21 +147,22 @@ def __convert_to_output(self, datas):

def __get_header_line(self, line):
# La premiere ligne donnes les entetes, parfois il y a des retours a la ligne dans ces entetes
header_line = self.__lines[line]
while line < len(self.__lines) \
and SEPARATOR_HEADER_LINE not in self.__lines[line]:
header_line = self._lines[line]
while line < len(self._lines) \
and SEPARATOR_HEADER_LINE not in self._lines[line]:
line += 1
if SEPARATOR_HEADER_LINE not in self.__lines[line]:
header_line += self.__lines[line]
if SEPARATOR_HEADER_LINE not in self._lines[line]:
header_line += self._lines[line]
line += 1
return line, header_line

# Tell whether the table being read ends at index `line`.
# The table is considered to continue while the current line is non-blank or,
# for versions other than 14, while the following line is non-blank; the
# returned value is the negation of that condition.
# NOTE(review): self._lines[line + 1] is read without a bounds check, so when
# the version is not 14 and `line` is the last index this raises IndexError.
def _detect_end_table(self, line):
return not(self.__version != 14 and self._lines[line + 1].strip() or self._lines[line].strip())

def __extract_table(self, line, table, number_of_cols):
# Un tableau se termine par 2 lignes vide (une seule dans polyspace 14...)
while line < len(self.__lines) \
and (self.__version != 14
and self.__lines[line + 1].strip()
or self.__lines[line].strip()):
while line < len(self._lines) \
and not self._detect_end_table(line):
(line, append_to_table) = self.__read_row(line, number_of_cols)
table["table"].append(self.__convert_to_output(append_to_table))
line += 1
Expand All @@ -172,11 +174,11 @@ def __save_table(self, line, chapter, current_section):
header_line = ""

# Si on trouve un tableau
if line < len(self.__lines) and self.__detect_begin_table(line):
if line < len(self._lines) and self._detect_begin_table(line):
# Initialisation de la sauvegarde
line += 1
table = {
"name": self.__lines[line],
"name": self._lines[line],
"section": current_section,
"table": []
}
Expand Down Expand Up @@ -205,20 +207,20 @@ def __read_tables(self):
chapter = None
current_section = None

while line < len(self.__lines):
while line < len(self._lines):
# ETAPE 1: Recherche de chapitre
line = self.__find_chapter(line)

# Chapitre trouve
if line < len(self.__lines):
if line < len(self._lines):
# Enregistrement des infos generales sur le chapitre et initialisations.
chapter = self.__lines[line]
chapter = self._lines[line]
self.__tables[chapter] = []
current_section = None
line += 1

# Tant qu'on ne change pas de chapitre ou que l'on atteint pas la fin du fichier...
while line < len(self.__lines) and self.__is_not_begin_chapter(line):
while line < len(self._lines) and self.__is_not_begin_chapter(line):
# On cherche la fin du chapitre ou un tableau
(line, current_section) = self.__next_chapter_or_table(line, current_section)
line = self.__save_table(line, chapter, current_section)
Expand All @@ -241,11 +243,11 @@ def get_all_tables(self):
...
]
"""
if self.__lines is None:
self.__read_lines()
if self._lines is None:
self._read_lines()
if len(self.__tables.keys()) == 0:
self.__read_tables()
return self.__tables

def close(self):
self.__file.close()
self._file.close()
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@ to read a polyspace report and export misra or run-time results in
an Excel file for easier analysis.

## Installation
You need to install XlsxWriter and html2text, you can use pip for that:
You need to install XlsxWriter, striprtf and html2text, you can use pip for that:

```
pip install XlsxWriter
pip install html2text
pip install striprtf
```

## Usage
Export polyspace data in html then run export-all.py, you can use `--help`.
Export the Polyspace data as an HTML or RTF report, then run export-all.py; use `--help` to list the available options.

The script was written for Polyspace 2018, but it can also process Polyspace 2014
reports if you add the `--poly14` argument.
Expand All @@ -23,6 +24,9 @@ Use arguments to filter what you need.

```
python export-all.py input.html output-folder/
# OR
## WARNING: when converting an RTF file you must specify --runtime or --misra
python export-all.py input.rtf output-folder/ --runtime
```
## Arguments
- `--misra` will export misra report only.
Expand Down
56 changes: 56 additions & 0 deletions RTFReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3

""" Nom du module: RTFReader
Description: Ce module permet la lecture d'un RTF et l'extraction des
tableaux qu'il contient.
Version: 1
Date: 9 avril 2019
Auteur: Louis MARTIN
Methodes et classes publiques:
- RTFReader
__init__(filename, version=None)
get_all_tables()
"""

# __________________________ IMPORT __________________________
import HTMLReader
from striprtf.striprtf import rtf_to_text

# ________________________ CONSTANTES ________________________

# These assignments rebind attributes on the imported HTMLReader module itself.
# They take effect as soon as this module is imported and are visible to every
# user of the HTMLReader module in the same process, not only to RTFReader.
HTMLReader.CHAR_BEGIN_CHAPTER = "Chapter"
HTMLReader.CHAR_BEGIN_TABLE = "Table"
HTMLReader.CHAR_SECTION = "## "
HTMLReader.SEPARATOR_CELL = "|"
HTMLReader.SEPARATOR_HEADER_LINE = "|"
# NOTE(review): presumably a marker that never appears in the converted text,
# which would disable sub-cell handling for RTF input - confirm.
HTMLReader.SEPARATOR_SUB_CELL = "NO_SUBCELL_POSSIBLE"
# PIPE / PIPE_ESCAPED are the pair used by RTFReader._read_lines to replace
# every "|" of the raw file with the text "PIPE" before conversion.
HTMLReader.PIPE = "|"
HTMLReader.PIPE_ESCAPED = "PIPE"


# __________________ Definition de classes ___________________
class RTFReader(HTMLReader.HTMLReader):
    """Reader for RTF reports built on top of ``HTMLReader.HTMLReader``.

    Only three things differ from the parent class: how the file content is
    turned into text lines, and how the beginning and the end of a table are
    recognised in those lines.
    """

    def __init__(self, filename, version=None):
        """Forward *filename* and *version* unchanged to the parent class."""
        super().__init__(filename, version=version)

    def _read_lines(self):
        """Convert the RTF file to plain text and store it in ``self._lines``."""
        # Each raw line is followed by an additional "\n" before conversion.
        raw_text = "".join(chunk + "\n" for chunk in self._file.readlines())

        # Replace every PIPE occurrence by PIPE_ESCAPED before the conversion.
        escaped_text = raw_text.replace(HTMLReader.PIPE, HTMLReader.PIPE_ESCAPED)

        self._lines = rtf_to_text(escaped_text).split("\n")

    def _detect_end_table(self, line):
        """Return True when the line at index *line* is blank."""
        return self._lines[line].strip() == ""

    def _detect_begin_table(self, line):
        """Return True when the line after *line* contains CHAR_BEGIN_TABLE."""
        following = line + 1
        if following >= len(self._lines):
            return False
        return HTMLReader.CHAR_BEGIN_TABLE in self._lines[following]
Empty file added __init__.py
Empty file.
10 changes: 9 additions & 1 deletion export-all.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,15 @@
exit()

# Lecture du rapport Polyspace
reader = HTMLReader(input_file, version=polyversion)
# Choose the reader from the file extension. The suffix is compared
# case-insensitively; a plain substring test would also match paths such as
# "my.rtf_exports/report.html" and would miss "REPORT.RTF".
if input_file.lower().endswith(".rtf"):
    # Imported lazily: importing RTFReader overrides HTMLReader's module
    # constants, which must not happen when an HTML report is read.
    from RTFReader import RTFReader
    # For RTF input the user has to choose which kind of report to read.
    if not (misra or runtime):
        print("With rtf file you should precise if you want to read MISRA or Runtime!")
        exit()
    reader = RTFReader(input_file, version=polyversion)
else:
    reader = HTMLReader(input_file, version=polyversion)

tables = reader.get_all_tables()
reader.close()

Expand Down
1 change: 1 addition & 0 deletions exportcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def create_synthese(self, is_misra=False):
csvPath = os.path.join(self.output, "Synthese.csv")
synth_sheet = open(csvPath, "w", newline = "")
csvWriter = self.__getWriter(synth_sheet)
line = 0

for sheet in self.sheets.values():
if is_misra and "by file" not in sheet[3].lower() and sheet[3].strip() or not is_misra:
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
XlsxWriter
html2text
striprtf
2 changes: 2 additions & 0 deletions test/test_HTMLReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def test():
    """Placeholder test that always passes."""
    assert True
16 changes: 16 additions & 0 deletions test/test_csvexporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os,sys
# Put the current working directory and its parent at the front of sys.path
# (presumably so that exportcsv can be imported whether pytest is started from
# the repository root or from test/ - confirm).
sys.path.insert(0, os.path.abspath("."))
sys.path.insert(0, os.path.abspath(".."))
# Debug output: shows the effective import search path.
print(sys.path)

from exportcsv import Exportcsv

def test_export():
    """Export one sheet with Exportcsv and check the written file.

    The sheet is added to an exporter targeting the ``TESTCSV`` folder; the
    test then checks that ``TESTCSV/EXPORT.csv`` exists and that splitting
    each of its lines on ``;`` gives back the original rows.
    """
    exporter = Exportcsv("TESTCSV")
    data = [["A1", "A2", "A3"], ["B1", "B2", "B3"]]
    exporter.add_sheet("EXPORT", data)
    assert os.path.exists("TESTCSV/EXPORT.csv")

    # Context manager: the handle is closed even if the assertion below fails.
    with open("TESTCSV/EXPORT.csv") as export:
        rows = [line.replace("\n", "").split(";") for line in export.readlines()]
    assert data == rows
13 changes: 13 additions & 0 deletions test/test_xlsxexporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os,sys
# Put the current working directory and its parent at the front of sys.path
# (presumably so that exportxlsx can be imported whether pytest is started from
# the repository root or from test/ - confirm).
sys.path.insert(0, os.path.abspath("."))
sys.path.insert(0, os.path.abspath(".."))
# Debug output: shows the effective import search path.
print(sys.path)

from exportxlsx import Exportxlsx

def test_export():
    """Write a two-row sheet through Exportxlsx and check TESTXLSX.xlsx exists."""
    rows = [["A1", "A2", "A3"], ["B1", "B2", "B3"]]
    writer = Exportxlsx("TESTXLSX")
    writer.add_sheet("EXPORT", rows)
    writer.export()
    workbook_exists = os.path.exists("TESTXLSX.xlsx")
    assert workbook_exists

0 comments on commit d2ad9c9

Please sign in to comment.