Skip to content

Commit

Permalink
Update deduper.rs (#197)
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni authored Sep 4, 2024
1 parent 1f72ce3 commit 621a6f4
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/deduper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,14 @@ fn write_attributes(
// skip empty documents if text_length is 0
for p in paragraphs {
let par_start = offset;
- offset += p.chars().count();
+ let par_char_length = p.chars().count();
+ offset += par_char_length;
if offset < text_length - 1 {
offset += 1; // For the newline
}
let par_end = offset;

- if offset < min_content_length {
+ if par_char_length < min_content_length {
// skip length 0 paragraphs
continue;
}
Expand Down

0 comments on commit 621a6f4

Please sign in to comment.