diff --git a/doc_src/_static/my_theme.css b/doc_src/_static/my_theme.css index 27f7efff..1f3cb655 100644 --- a/doc_src/_static/my_theme.css +++ b/doc_src/_static/my_theme.css @@ -24,3 +24,26 @@ padding: 0.2em; text-align: left; } + +table.super-collator { border-collapse: collapse } +table.super-collator th, +table.super-collator td.outer { border: 1px solid black } +table.super-collator td.inner { width: 50%; padding: 0 0.5ex; text-align: right } +table.super-collator td.prefilled { background-color: #eee } +table.super-collator td.em { font-weight: bold } +table.super-collator th { padding: 0 0.5ex; text-align: right } +table.super-collator table { width: 100% } +table.super-collator-result td { border: 1px solid black; padding: 0 1ex } + +table.super-collator { + margin-bottom: 1em; +} + +#super-collator-phase2 td.outer.bt { + background-color: #fefece; + border: 1.5pt solid #a80136; +} + +#super-collator-phase2 td.outer.bt .arrow { + color: #a80136; +} diff --git a/doc_src/_static/super-collator-phase1.html b/doc_src/_static/super-collator-phase1.html new file mode 100644 index 00000000..546ed9c4 --- /dev/null +++ b/doc_src/_static/super-collator-phase1.html @@ -0,0 +1,13 @@ +
sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
the | quick | brown | fox | jumps | over | the | lazy | dog |
- | sick | - | fox | is | - | - | crazy | - |
+
|
+
+
|
+
sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
the | quick | brown | fox | jumps | over | the | lazy | dog |
- | sick | - | fox | is | - | - | crazy | - |
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/maintenance.html b/docs/maintenance.html index 0083459a..61d557fd 100644 --- a/docs/maintenance.html +++ b/docs/maintenance.html @@ -185,7 +185,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/overviews.html b/docs/overviews.html index 5e0a9931..be24667f 100644 --- a/docs/overviews.html +++ b/docs/overviews.html @@ -55,7 +55,10 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Description of the collation tool and the pre-processing of the TEI files.
+This overview describes the collation tool +and the pre-processing of the TEI files.
We extract every chapter of every capitular from all manuscripts and store them -in separate records in the Postgres database on the application Server. The -text stored in the database is normalized.
-If a manuscript contains more than one copy of a chapter, all copies are -extracted. If a corrector hand was active in the chapter, both an original and -a corrected version are extracted.
+We extract every chapter of every capitular from all manuscripts and store them in the +Postgres database. The text stored in the database is already normalized.
+If a manuscript contains more than one copy of a chapter, all copies are extracted. If +one or more correcting hands were active in the chapter, the original and each corrected +version are extracted.
The online collation tool knows about all versions and offers them to the user.
-The collation tool is divided in two parts, one frontend written in JavaScript and the -Vue.js library, and one backend application server written in Python. The application -server retrieves the chapters to collate from the database and collates them. The -results are sent to the frontend that does the formatting for display.
-The collation tool consists of two parts: one frontend written in JavaScript and using +the Vue.js library, and one backend application server written in Python and using the +super-collator library.
+The application server retrieves the chapters from the database and collates them. The +results of the collation are sent as JSON to the frontend, which does the formatting for +display.
+The collation unit is the chapter, so that only short texts need to be collated, @@ -299,24 +303,118 @@
The application server uses an enhancement of the Needleman-Wunsch algorithm by Gotoh +[Gotoh1982]. This section provides a very high level overview of the algorithm.
+In phase 1 the algorithm builds a table. For example, this is the table built for the +two strings “the quick brown fox jumps over the lazy dog” and “sick fox is crazy”.
+sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
Every cell in the table contains three values, referred to as \(D\), \(P\), and \(Q\) in +Gotoh’s paper, and an arrow, like this:
+
+
|
+
The grayed cells in the first row and first column are initialized using the gap start +and gap extension penalties. The numbers for each remaining cell are calculated using +only values from the three neighbouring cells (to the top-left, to the top, and to the left +of the current cell):
+Finally the arrow in the current cell is set to point to that cell which yielded the +highest of the current cell’s \(D\), \(P\), and \(Q\) values.
+When the table is thus completed, two empty sequences are created. Then the algorithm +starts backtracking from the last (bottom-right) cell following the arrows until it +reaches the first (top-left) cell. If the arrow points:
up: the word in the row header is added to the first sequence, a hyphen is added to the +second sequence,
diagonally (up and left): the word in the row header is added to the first sequence, the word in the column +header is added to the second sequence,
left: a hyphen is added to the first sequence, the word in the column header is added to the +second sequence.
+sick | fox | is | crazy | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
| |||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
quick |
|
|
|
|
| ||||||||||||||||||||
brown |
|
|
|
|
| ||||||||||||||||||||
fox |
|
|
|
|
| ||||||||||||||||||||
jumps |
|
|
|
|
| ||||||||||||||||||||
over |
|
|
|
|
| ||||||||||||||||||||
the |
|
|
|
|
| ||||||||||||||||||||
lazy |
|
|
|
|
| ||||||||||||||||||||
dog |
|
|
|
|
|
Finally the two sequences are reversed and printed.
+the | quick | brown | fox | jumps | over | the | lazy | dog |
- | sick | - | fox | is | - | - | crazy | - |
The algorithm can be customized by setting:
+a word comparison (similarity) function,
the starting gap penalty,
the gap opening penalty,
and the gap extension penalty.
The word comparison function returns a similarity value between 0 and 1. The -similarity is calculated as follows:
-All words in the input texts are split into sets of trigrams. The trigrams are -obtained by first prefixing and suffixing the word with two spaces respectively, -then cutting the resulting string into all possible strings of length 3. This -means that all trigrams partially overlap each other.
+The word comparison function returns a similarity value between 0 and 1, 0 being totally +different and 1 being completely equal. The chosen function is not critical to the +functioning of the aligner. The similarity should increase with the desirability of the +alignment, but otherwise there are no fixed rules.
+In the current implementation the similarity is calculated as follows:
+All words in the input texts are split into sets of trigrams (sometimes called +3-shingles). The trigrams are obtained by first prefixing and suffixing the word with +two spaces respectively, then cutting the resulting string into all possible strings of +length 3. This means that all trigrams partially overlap each other.
To calculate the similarity between two words three sets are built: the set of trigrams in word a, the set of trigrams in word b, and the set of trigrams common to both words. The similarity is then given by the formula:
The factor 2 was added to bring the similarity of identical words to 1.
+\[\mbox{similarity}(a,b)= \frac{2|set_{a} \cap set_{b}|}{|set_a| + |set_b|}\] +The factor of 2 was added to bring the similarity of identical words to 1.
+This is sometimes called the +Sørensen–Dice coefficient.
An example calculation follows:
-The similarity based on trigrams was chosen because its calculation can be done in @@ -447,16 +545,18 @@
Gotoh, O. 1982, An Improved Algorithm for Matching Biological Sequences, J. Mol. Biol. 162, 705-708 http://jaligner.sourceforge.net/references/gotoh1982.pdf
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/overviews/html_generation.html b/docs/overviews/html_generation.html index de58fde0..a1ca6f9f 100644 --- a/docs/overviews/html_generation.html +++ b/docs/overviews/html_generation.html @@ -316,7 +316,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/overviews/meta_search.html b/docs/overviews/meta_search.html index d1b4525d..df6a4c8e 100644 --- a/docs/overviews/meta_search.html +++ b/docs/overviews/meta_search.html @@ -333,7 +333,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/vm.html b/docs/vm.html index 34b29d86..9704a32f 100644 --- a/docs/vm.html +++ b/docs/vm.html @@ -211,7 +211,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/vm/apache/wordpress-custom.html b/docs/vm/apache/wordpress-custom.html index 5c8a4f64..f364433b 100644 --- a/docs/vm/apache/wordpress-custom.html +++ b/docs/vm/apache/wordpress-custom.html @@ -193,7 +193,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/vm/apache/wordpress/plugins/collation-tool.html b/docs/vm/apache/wordpress/plugins/collation-tool.html index befcc709..d58908f6 100644 --- a/docs/vm/apache/wordpress/plugins/collation-tool.html +++ b/docs/vm/apache/wordpress/plugins/collation-tool.html @@ -476,7 +476,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Highlight the search terms in the post.
-Highlight the full post if it we came through the search page. We use +
Highlight the full post if we came through the search page. We use the query string in the HTTP referrer to highlight the content.
The naive approach:
@@ -744,8 +744,8 @@plugins/cap-meta-sear
- on_cap_meta_search_the_permalink(permalink)¶
Get the permalink for the search result
-Return a link that will go to the post and highlight the search terms -if followed.
+Called from the search results page of the Capitularia theme. Return a link that +will go to the post and highlight the search terms if followed.
- Parameters:
@@ -797,7 +797,7 @@
plugins/meta-search
-Built with Sphinx using a diff --git a/docs/vm/apache/wordpress/plugins/page-generator.html b/docs/vm/apache/wordpress/plugins/page-generator.html index 365eddc9..eb2369f9 100644 --- a/docs/vm/apache/wordpress/plugins/page-generator.html +++ b/docs/vm/apache/wordpress/plugins/page-generator.html @@ -1616,7 +1616,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
plugins/page-generato
-Built with Sphinx using a diff --git a/docs/vm/apache/wordpress/theme.html b/docs/vm/apache/wordpress/theme.html index f903b5c9..69784709 100644 --- a/docs/vm/apache/wordpress/theme.html +++ b/docs/vm/apache/wordpress/theme.html @@ -366,7 +366,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
-Built with Sphinx using a diff --git a/docs/vm/apache/wordpress/theme/main.html b/docs/vm/apache/wordpress/theme/main.html index 8d6af5ea..eaf20ac1 100644 --- a/docs/vm/apache/wordpress/theme/main.html +++ b/docs/vm/apache/wordpress/theme/main.html @@ -1201,7 +1201,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
themes/capitularia
-Built with Sphinx using a diff --git a/docs/vm/apache/wordpress/theme/widgets.html b/docs/vm/apache/wordpress/theme/widgets.html index 7f89a476..1c3683ce 100644 --- a/docs/vm/apache/wordpress/theme/widgets.html +++ b/docs/vm/apache/wordpress/theme/widgets.html @@ -1291,7 +1291,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
themes/Capitularia/wi
-Built with Sphinx using a diff --git a/docs/vm/app_server.html b/docs/vm/app_server.html index 3c505a4a..6ce90570 100644 --- a/docs/vm/app_server.html +++ b/docs/vm/app_server.html @@ -744,7 +744,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Endpoints
-Built with Sphinx using a diff --git a/docs/vm/backup.html b/docs/vm/backup.html index ed7387fd..7acc01ad 100644 --- a/docs/vm/backup.html +++ b/docs/vm/backup.html @@ -225,7 +225,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Update
-Built with Sphinx using a diff --git a/docs/vm/database.html b/docs/vm/database.html index 918247f3..ee235d50 100644 --- a/docs/vm/database.html +++ b/docs/vm/database.html @@ -1585,7 +1585,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Schema gis: -
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/vm/intro.html b/docs/vm/intro.html index be88f985..fe17ee75 100644 --- a/docs/vm/intro.html +++ b/docs/vm/intro.html @@ -349,7 +349,7 @@Introduction -
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a diff --git a/docs/vm/transformations.html b/docs/vm/transformations.html index 12f42603..c7fabe58 100644 --- a/docs/vm/transformations.html +++ b/docs/vm/transformations.html @@ -1756,7 +1756,7 @@Graph of Stylesheet D
-Built with Sphinx using a diff --git a/docs/vm/user.html b/docs/vm/user.html index b8684a89..0dea21bc 100644 --- a/docs/vm/user.html +++ b/docs/vm/user.html @@ -293,7 +293,7 @@© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Security -
© Copyright 2018-22 CCeH - Licensed under the GNU GPL v3 or later.
+© Copyright 2018-23 CCeH - Licensed under the GNU GPL v3 or later.
Built with Sphinx using a