#59 - Create reader for new variable corpus format

- Fixed some problems in the reader
- Added document title to JCas metadata
openminted · Dec 14, 2017 · a624c09 · a624c09
1 parent 0a40f66
commit a624c09
Showing 1 changed file with 17 additions and 5 deletions.
diff --git a/...n/src/main/java/eu/openminted/uc/socialsciences/variabledetection/io/XmlCorpusReader.java b/...n/src/main/java/eu/openminted/uc/socialsciences/variabledetection/io/XmlCorpusReader.java
@@ -19,6 +19,7 @@
 package eu.openminted.uc.socialsciences.variabledetection.io;
 
 import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
 import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import eu.openminted.share.annotations.api.Component;
@@ -58,10 +59,9 @@
 import static org.apache.commons.io.IOUtils.closeQuietly;
 
 /**
- * Collection reader for PDF files using CERMINE
- * <a href="https://github.com/CeON/CERMINE">https://github.com/CeON/CERMINE</a>.
+ * Collection reader for Variable mention XML corpus
  */
-@ResourceMetaData(name = "CERMINE PDF Reader")
+@ResourceMetaData(name = "Variable Mention Corpus Reader")
 @MimeTypeCapability({ MimeTypes.APPLICATION_PDF })
 @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
         "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading",
@@ -123,13 +123,22 @@ private void process(InputStream aInputStream, CAS aCAS) throws IOException
         if (document != null) {
             try {
                 XPath xpath = XPathFactory.newInstance().newXPath();
-                Node sampleNode = (Node) xpath.compile("//testset/topic/sample").evaluate(document,
+                Node sampleNode = (Node) xpath.compile("//testset/sample").evaluate(document,
                         XPathConstants.NODE);
+                if (sampleNode == null) {
+                    sampleNode = (Node) xpath.compile("//testset/topic/sample").evaluate(document,
+                            XPathConstants.NODE);
+                }
 
                 Node docNode = (Node) xpath.compile("./doc").evaluate(sampleNode, XPathConstants.NODE);
                 NamedNodeMap docAttributes = docNode.getAttributes();
                 language = docAttributes.getNamedItem("lang").getTextContent();
 
+                Node titleNode = (Node) xpath.compile(".//doc_title").evaluate(docNode,
+                        XPathConstants.NODE);
+                DocumentMetaData metadata = DocumentMetaData.get(aCAS.getJCas());
+                metadata.setDocumentTitle(titleNode.getTextContent());
+
                 NodeList sentenceNodes = (NodeList) xpath.compile(".//s").evaluate(docNode,
                         XPathConstants.NODESET);
                 Node sentenceNode = sentenceNodes.item(0);
@@ -171,7 +180,10 @@ private void process(InputStream aInputStream, CAS aCAS) throws IOException
                         String question = normalizeWhitespaces(questionNode.getTextContent().trim());
                         Node subQuestionNode = (Node) xpath.compile("./v_subquestion")
                                 .evaluate(variableNode, XPathConstants.NODE);
-                        String subQuestion = normalizeWhitespaces(subQuestionNode.getTextContent().trim());
+                        String subQuestion = "";
+                        if (subQuestionNode != null) {
+                            subQuestion = normalizeWhitespaces(subQuestionNode.getTextContent().trim());
+                        }
                         // TODO implement answer extraction
                         String answer = "";