#52 - Implement feature extractors from GESIS paper

- added lemma Ngram feature extractor
openminted · Sep 1, 2017 · 178422a · 178422a
1 parent 69c186a
commit 178422a
Showing 1 changed file with 71 additions and 0 deletions.
diff --git a/...ain/java/eu/openminted/uc/socialsciences/variabledetection/features/LuceneLemmaNGram.java b/...ain/java/eu/openminted/uc/socialsciences/variabledetection/features/LuceneLemmaNGram.java
@@ -0,0 +1,71 @@
+/*******************************************************************************
+ * Copyright 2017
+ * Ubiquitous Knowledge Processing (UKP) Lab
+ * Technische Universität Darmstadt
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package eu.openminted.uc.socialsciences.variabledetection.features;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.uima.fit.descriptor.TypeCapability;
+import org.apache.uima.jcas.JCas;
+import org.dkpro.tc.api.exception.TextClassificationException;
+import org.dkpro.tc.api.features.Feature;
+import org.dkpro.tc.api.features.FeatureExtractor;
+import org.dkpro.tc.api.type.TextClassificationTarget;
+import org.dkpro.tc.features.ngram.LuceneNGram;
+import org.dkpro.tc.features.ngram.util.NGramUtils;
+
+import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
+
+/**
+ * Extracts lemma n-grams within the given text classification target. For every n-gram in the
+ * top-k set one binary feature is produced: 1 if the n-gram occurs among the n-grams collected
+ * for the target, 0 otherwise.
+ */
+@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
+        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" })
+public class LuceneLemmaNGram
+    extends LuceneNGram
+    implements FeatureExtractor
+{
+
+    /**
+     * Builds one feature per entry of the top-k n-gram set, named
+     * {@code <feature prefix>_<n-gram>}.
+     */
+    @Override
+    public Set<Feature> extract(JCas jcas, TextClassificationTarget target)
+        throws TextClassificationException
+    {
+        // Lemma.class is passed as the annotation type, so the n-grams are built from lemmas.
+        FrequencyDistribution<String> documentNgrams = NGramUtils.getDocumentNgrams(jcas, target,
+                ngramLowerCase, filterPartialStopwordMatches, ngramMinN, ngramMaxN, stopwords,
+                Lemma.class);
+
+        // Both values are loop-invariant, so they are computed once up front.
+        Set<String> presentNgrams = documentNgrams.getKeys();
+        String namePrefix = getFeaturePrefix() + "_";
+
+        Set<Feature> features = new HashSet<Feature>();
+        for (String topNgram : topKSet.getKeys()) {
+            if (presentNgrams.contains(topNgram)) {
+                features.add(new Feature(namePrefix + topNgram, 1));
+            }
+            else {
+                // NOTE(review): the trailing 'true' presumably marks a default value - confirm
+                // against the Feature constructor.
+                features.add(new Feature(namePrefix + topNgram, 0, true));
+            }
+        }
+        return features;
+    }
+
+    /**
+     * Returns the prefix prepended to the name of every feature produced by this extractor.
+     */
+    @Override
+    protected String getFeaturePrefix()
+    {
+        return "lemma-ngram";
+    }
+}