Skip to content

Commit

Permalink
#59 - Create reader for new variable corpus format
Browse files: browse the repository at this point in the history
- Fixed some problems in the reader
- Added the document title to the JCas metadata
  • Loading branch information
maxxkia committed Dec 14, 2017
1 parent 0a40f66 commit a624c09
Showing 1 changed file with 17 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package eu.openminted.uc.socialsciences.variabledetection.io;

import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import eu.openminted.share.annotations.api.Component;
Expand Down Expand Up @@ -58,10 +59,9 @@
import static org.apache.commons.io.IOUtils.closeQuietly;

/**
* Collection reader for PDF files using CERMINE
* <a href="https://github.com/CeON/CERMINE">https://github.com/CeON/CERMINE</a>.
* Collection reader for Variable mention XML corpus
*/
@ResourceMetaData(name = "CERMINE PDF Reader")
@ResourceMetaData(name = "Variable Mention Corpus Reader")
@MimeTypeCapability({ MimeTypes.APPLICATION_PDF })
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading",
Expand Down Expand Up @@ -123,13 +123,22 @@ private void process(InputStream aInputStream, CAS aCAS) throws IOException
if (document != null) {
try {
XPath xpath = XPathFactory.newInstance().newXPath();
Node sampleNode = (Node) xpath.compile("//testset/topic/sample").evaluate(document,
Node sampleNode = (Node) xpath.compile("//testset/sample").evaluate(document,
XPathConstants.NODE);
if (sampleNode == null) {
sampleNode = (Node) xpath.compile("//testset/topic/sample").evaluate(document,
XPathConstants.NODE);
}

Node docNode = (Node) xpath.compile("./doc").evaluate(sampleNode, XPathConstants.NODE);
NamedNodeMap docAttributes = docNode.getAttributes();
language = docAttributes.getNamedItem("lang").getTextContent();

Node titleNode = (Node) xpath.compile(".//doc_title").evaluate(docNode,
XPathConstants.NODE);
DocumentMetaData metadata = DocumentMetaData.get(aCAS.getJCas());
metadata.setDocumentTitle(titleNode.getTextContent());

NodeList sentenceNodes = (NodeList) xpath.compile(".//s").evaluate(docNode,
XPathConstants.NODESET);
Node sentenceNode = sentenceNodes.item(0);
Expand Down Expand Up @@ -171,7 +180,10 @@ private void process(InputStream aInputStream, CAS aCAS) throws IOException
String question = normalizeWhitespaces(questionNode.getTextContent().trim());
Node subQuestionNode = (Node) xpath.compile("./v_subquestion")
.evaluate(variableNode, XPathConstants.NODE);
String subQuestion = normalizeWhitespaces(subQuestionNode.getTextContent().trim());
String subQuestion = "";
if (subQuestionNode != null) {
subQuestion = normalizeWhitespaces(subQuestionNode.getTextContent().trim());
}
// TODO implement answer extraction
String answer = "";

Expand Down

0 comments on commit a624c09

Please sign in to comment.