Skip to content

Commit

Permalink
Fix pagebreak retrieval when the pagebreak's doc number is lower than the match's doc number
Browse files Browse the repository at this point in the history
Change-Id: I6255b44c523c7fea1656c78a1aa18db1febfc4c3
  • Loading branch information
margaretha authored and Akron committed Mar 28, 2023
1 parent f420cb3 commit 405413e
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 52 deletions.
3 changes: 3 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.61.2 2023-03-28
- [bugfix] Fix pagebreak retrieval (margaretha, diewald)

0.61.1 2023-02-14
- [bugfix] Fixed ensuring same documents of spans (solved #87,
margaretha)
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Krill</artifactId>
<version>0.61.1</version>
<version>0.61.2</version>
<packaging>jar</packaging>

<name>Krill</name>
Expand Down
77 changes: 56 additions & 21 deletions src/main/java/de/ids_mannheim/korap/response/Match.java
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,6 @@ public void addRelation (int srcStart,
};
};


/**
 * Registers a pagebreak on this match by wrapping the given values in a
 * {@code Highlight} and passing it to {@code addHighlight}.
 *
 * @param start
 *            offset the pagebreak is anchored at (the callers in
 *            {@code retrievePagebreaks} pass a character offset)
 * @param pagenumber
 *            page number associated with the pagebreak
 */
public void addPagebreak (int start, int pagenumber) {
    final Highlight pagebreakMarker = new Highlight(start, pagenumber);
    this.addHighlight(pagebreakMarker);
};
Expand Down Expand Up @@ -864,11 +863,13 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,

int charOffset = 0, pagenumber = 0, start = 0;

if (DEBUG)
if (DEBUG) {
log.debug("=================================");
log.debug("Retrieve pagebreaks between {}-{}",
this.getStartPos(),
this.getEndPos());

};

try {

// Store character offsets in ByteBuffer
Expand All @@ -890,19 +891,26 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
while (pagebreakSpans.next() == true) {

if (DEBUG) {
log.debug("There is a pagebreak at {}/{}",
log.debug("There is a pagebreak at {}/{} and we are at {}",
pagebreakSpans.doc(),
pagebreakSpans.start());
pagebreakSpans.start(),
this.localDocID);
};

// Current pagebreak is not in the correct document
if (pagebreakSpans.doc() != this.localDocID) {
pagebreakSpans.skipTo(this.localDocID);

// No pagebreaks in this document
if (pagebreakSpans.doc() != this.localDocID)
break;
};
if (pagebreakSpans.doc() != this.localDocID) {
if (pagebreakSpans.doc() < this.localDocID) {
pagebreakSpans.skipTo(this.localDocID);

// No pagebreaks in this document
if (pagebreakSpans.doc() != this.localDocID)
break;
}
else {
break;
};
continue;
};

if (DEBUG)
log.debug("The pagebreak occurs in the document");
Expand All @@ -911,16 +919,18 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
// if it is in the correct area
if (pagebreakSpans.start() <= this.getStartPos()) {

if (DEBUG)
log.debug("PB start position is before match at {}",
pagebreakSpans.start());

// Only the first payload is relevant
b = pagebreakSpans.getPayload().iterator().next();
start = pagebreakSpans.start();

if (DEBUG)
log.debug("PB start position is before match at {}:{}",
pagebreakSpans.start(),
b);

}

// This is the first pagebreak!
// This is the first pagebreak inside the match!
else {

// b is already defined!
Expand All @@ -937,6 +947,7 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,

// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});

if (start >= this.getStartPos()) {

if (DEBUG)
Expand All @@ -945,6 +956,7 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
pagenumber);
this.addPagebreak(charOffset, pagenumber);
};
b = null;
}

// b wasn't used yet
Expand All @@ -963,17 +975,41 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});
this.addPagebreak(charOffset,pagenumber);
b = null;
}

// Pagebreak beyond the current position
else {
break;
};

// Reset byte
b = null;
};
};

if (b != null) {
bb.rewind();
bb.put(b);
bb.rewind();

pagenumber = bb.getInt();
charOffset = bb.getInt();

if (DEBUG)
log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);

// This is a remembered pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});

if (start >= this.getStartPos()) {

if (DEBUG)
log.debug("Add pagebreak to rendering: {}-{}",
charOffset,
pagenumber);
this.addPagebreak(charOffset, pagenumber);
};

b = null;
};
}
catch (Exception e) {
log.warn("Some problems with ByteBuffer: {}", e.getMessage());
Expand All @@ -988,7 +1024,6 @@ public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
return pagebreaks;
};


// Expand the context to a span
public int[] expandContextToSpan (String element) {

Expand Down
94 changes: 64 additions & 30 deletions src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
Original file line number Diff line number Diff line change
@@ -1,59 +1,95 @@
package de.ids_mannheim.korap.index;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.query.QueryBuilder;
import de.ids_mannheim.korap.query.SpanClassQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
import de.ids_mannheim.korap.query.SpanFocusQuery;
import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.SpanWithinQuery;
import de.ids_mannheim.korap.query.QueryBuilder;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;

/*
* Retrieve pagebreak annotations
*/

@RunWith(JUnit4.class)
public class TestPagebreakIndex {

/**
 * Builds a five-token fixture document over the text "abcde" whose token
 * stream contains no {@code ~:base/s:pb} pagebreak annotation.
 *
 * NOTE(review): tokens 3 and 4 carry the surface terms s:a / s:b although
 * the primary text is "abcde" and their i: terms are d / e - confirm this
 * is intended and not a copy-paste slip.
 *
 * @return the populated field document
 */
private FieldDocument createFieldDoc0 () {
    // abcde
    final String primaryText = "abcde";
    final String tokenStream =
        "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>5]"
        + "[(1-2)s:b|i:b|_1$<i>1<i>2]"
        + "[(2-3)s:c|i:c|_2$<i>2<i>3]"
        + "[(3-4)s:a|i:d|_3$<i>3<i>4]"
        + "[(4-5)s:b|i:e|_4$<i>4<i>5]";

    FieldDocument doc = new FieldDocument();
    doc.addTV("tokens", primaryText, tokenStream);
    return doc;
}

/**
 * Builds a ten-token fixture document over the text "abcabcabac" with
 * three {@code ~:base/s:pb} pagebreak annotations, attached to tokens
 * 0, 5 and 8. The tests in this class expect these to surface as pages
 * 528, 529 and 530.
 *
 * @return the populated field document
 */
private FieldDocument createFieldDoc1 () {
    // abcabcabac
    final String primaryText = "abcabcabac";
    final String tokenStream =
        "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]"
        + "[(1-2)s:b|i:b|_1$<i>1<i>2]"
        + "[(2-3)s:c|i:c|_2$<i>2<i>3]"
        + "[(3-4)s:a|i:a|_3$<i>3<i>4]"
        + "[(4-5)s:b|i:b|_4$<i>4<i>5]"
        + "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]"
        + "[(6-7)s:a|i:a|_6$<i>6<i>7]"
        + "[(7-8)s:b|i:b|_7$<i>7<i>8]"
        + "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]"
        + "[(9-10)s:c|i:c|_9$<i>9<i>10]";

    FieldDocument doc = new FieldDocument();
    doc.addTV("tokens", primaryText, tokenStream);
    return doc;
}

/**
 * Indexes a document without pagebreaks followed by a document with
 * pagebreaks, searches for the term s:c and checks positions and page
 * numbers of every match. The match in the first document must report
 * no page (-1); the matches in the second document must report the
 * page that starts at or before the match.
 */
@Test
public void testPageBreakDocLowerThanLocalDocId () throws IOException {
    KrillIndex index = new KrillIndex();
    index.addDoc(createFieldDoc0());
    index.addDoc(createFieldDoc1());
    index.commit();

    SpanTermQuery termQuery = new SpanTermQuery(new Term("tokens", "s:c"));
    Result kr = index.search(termQuery, (short) 10);

    // One row per expected match: {startPos, endPos, startPage, endPage}
    final int[][] expected = {
        { 2, 3, -1, -1 },
        { 2, 3, 528, -1 },
        { 5, 6, 529, -1 },
        { 9, 10, 530, -1 }
    };

    assertEquals(expected.length, kr.getMatches().size());

    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i][0], kr.getMatch(i).getStartPos());
        assertEquals(expected[i][1], kr.getMatch(i).getEndPos());
        assertEquals(expected[i][2], kr.getMatch(i).getStartPage());
        assertEquals(expected[i][3], kr.getMatch(i).getEndPage());
    }
};

@Test
public void indexExample1 () throws Exception {
KrillIndex ki = new KrillIndex();

// abcabcabac
FieldDocument fd = new FieldDocument();
fd.addTV("tokens", "abcabcabac",
"[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
"[(1-2)s:b|i:b|_1$<i>1<i>2]" +
"[(2-3)s:c|i:c|_2$<i>2<i>3]" +
"[(3-4)s:a|i:a|_3$<i>3<i>4]" +
"[(4-5)s:b|i:b|_4$<i>4<i>5]" +
"[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
"[(6-7)s:a|i:a|_6$<i>6<i>7]" +
"[(7-8)s:b|i:b|_7$<i>7<i>8]" +
"[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
"[(9-10)s:c|i:c|_9$<i>9<i>10]");
ki.addDoc(fd);
ki.addDoc(createFieldDoc1());
ki.commit();

SpanQuery sq;
Expand Down Expand Up @@ -88,8 +124,6 @@ public void indexExample1 () throws Exception {
"</span>",
kr.getMatch(0).getSnippetHTML());

/*
QueryBuilder qb = new QueryBuilder("tokens");
sq = qb.seq().append(
qb.repeat(
Expand All @@ -106,6 +140,7 @@ public void indexExample1 () throws Exception {

assertEquals(528, kr.getMatch(0).getStartPage());
assertEquals(529, kr.getMatch(0).getEndPage());

assertEquals(
"snippetHTML",
"<span class=\"context-left\"></span>"+
Expand All @@ -121,6 +156,5 @@ public void indexExample1 () throws Exception {
"bac"+
"</span>",
kr.getMatch(0).getSnippetHTML());
*/
};
};

0 comments on commit 405413e

Please sign in to comment.