Commit
#52 - Implement feature extractors from GESIS paper
- implemented TheSoz feature extractor
- added TheSoz feature extractor to TrainTestPipeline
maxxkia committed Sep 18, 2017
1 parent 9502973 commit d1d5d34
Showing 7 changed files with 452 additions and 2 deletions.
13 changes: 13 additions & 0 deletions ss-variable-detection/pom.xml
@@ -14,6 +14,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lsr.version>0.8.1</lsr.version>
<jena.version>3.4.0</jena.version>
</properties>

<dependencyManagement>
@@ -112,6 +113,18 @@
<version>2.5.0</version>
</dependency>

<!-- Jena -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-core</artifactId>
<version>${jena.version}</version>
</dependency>
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-arq</artifactId>
<version>${jena.version}</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
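Side note: the new Jena dependencies are presumably consumed by the TheSozResource implementation (one of the changed files not rendered on this page) to load the TheSoz SKOS/RDF thesaurus and look up concept labels. A minimal sketch of such a lookup follows; the dump file name, the SPARQL shape, and the method name are illustrative assumptions, not the committed code.

// Hypothetical sketch only; file name, query shape and method name are assumptions.
import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.riot.RDFDataMgr;

public class TheSozLookupSketch
{
    public static boolean containsConceptLabel(Model model, String label)
    {
        ParameterizedSparqlString query = new ParameterizedSparqlString(
                "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
                + "ASK { ?concept skos:prefLabel|skos:altLabel ?l . "
                + "      FILTER (lcase(str(?l)) = lcase(str(?label))) }");
        query.setLiteral("label", label);
        try (QueryExecution exec = QueryExecutionFactory.create(query.asQuery(), model)) {
            return exec.execAsk();
        }
    }

    public static void main(String[] args)
    {
        // Load a (hypothetical) local TheSoz dump; jena-arq picks the parser from the extension.
        Model thesoz = RDFDataMgr.loadModel("thesoz.rdf");
        System.out.println(containsConceptLabel(thesoz, "social inequality"));
    }
}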
@@ -32,8 +32,10 @@
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import eu.openminted.uc.socialsciences.variabledetection.features.LuceneLemmaNGram;
import eu.openminted.uc.socialsciences.variabledetection.features.TheSozFeatures;
import eu.openminted.uc.socialsciences.variabledetection.features.WordnetFeatures;
import eu.openminted.uc.socialsciences.variabledetection.io.TextDatasetReader;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;
import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.bayes.NaiveBayes;
@@ -116,9 +118,11 @@ public static ParameterSpace getParameterSpace() throws ResourceInitializationEx
LuceneSkipNGram.PARAM_NGRAM_MIN_N, 2, LuceneSkipNGram.PARAM_NGRAM_MAX_N, 3),
TcFeatureFactory.create(NEFeatureExtractor.class),
TcFeatureFactory.create(WordnetFeatures.class, WordnetFeatures.PARAM_RESOURCE_NAME,
"wordnet", WordnetFeatures.PARAM_RESOURCE_LANGUAGE, "en",
WordnetFeatures.WORDNET_FIELD, WordnetFeatures.PARAM_RESOURCE_LANGUAGE, "en",
WordnetFeatures.PARAM_SYNONYM_FEATURE, true,
WordnetFeatures.PARAM_HYPERNYM_FEATURE, false)));
WordnetFeatures.PARAM_HYPERNYM_FEATURE, false),
TcFeatureFactory.create(TheSozFeatures.class, TheSozFeatures.PARAM_RESOURCE_NAME,
TheSozResource.NAME)));

// single-label feature selection (Weka specific options), reduces the feature set to 10
Map<String, Object> dimFeatureSelection = new HashMap<String, Object>();
@@ -0,0 +1,115 @@
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;

/**
* Extracts features using the TheSoz knowledge base.
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class TheSozFeatures
extends LuceneFeatureExtractorBase
implements FeatureExtractor
{
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName";
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true)
protected String knowledgeBaseName;

protected KnowledgeBaseResource kbr;

@Override
public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
throws ResourceInitializationException
{
if (!super.initialize(aSpecifier, aAdditionalParams)) {
return false;
}

try {
kbr = KnowledgeBaseFactory.getInstance().get(knowledgeBaseName);
}
catch (ResourceLoaderException e) {
throw new ResourceInitializationException(e);
}
return true;
}

@Override
public Set<Feature> extract(JCas view, TextClassificationTarget target)
throws TextClassificationException
{
FrequencyDistribution<String> featureVector = new FrequencyDistribution<>();
// TODO parameterize max ngram size
FrequencyDistribution<String> documentNgrams = TheSozMetaCollector.getDocumentNgrams(view,
true, false, 1, 4, stopwords, Token.class);
for (String ngram : documentNgrams.getKeys()) {
// TODO language check
if (kbr.containsConceptLabel(ngram)) {
featureVector.inc(ngram);
}
}

Set<Feature> features = new HashSet<Feature>();
for (String topNgram : topKSet.getKeys()) {
if (featureVector.getKeys().contains(topNgram)) {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 1));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true));
}
}
return features;
}

@Override
public List<MetaCollectorConfiguration> getMetaCollectorClasses(
Map<String, Object> parameterSettings)
throws ResourceInitializationException
{
return Arrays
.asList(new MetaCollectorConfiguration(TheSozMetaCollector.class, parameterSettings)
.addStorageMapping(TheSozMetaCollector.PARAM_TARGET_LOCATION,
TheSozFeatures.PARAM_SOURCE_LOCATION,
TheSozMetaCollector.LUCENE_DIR));
}

@Override
protected String getFieldName()
{
return TheSozResource.NAME + featureExtractorName;
}

@Override
protected int getTopN()
{
return ngramUseTopK;
}

@Override
protected String getFeaturePrefix()
{
return "TheSoz-";
}
}
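The KnowledgeBaseResource interface and the TheSozResource implementation used above appear to be part of this commit but are not rendered on this page. Inferred purely from the call sites (containsConceptLabel and TheSozResource.NAME), a hypothetical minimal shape of the interface would be:

// Hypothetical sketch inferred from the call sites in TheSozFeatures and TheSozMetaCollector;
// the actual interface in the commit may declare additional methods.
public interface KnowledgeBaseResource
{
    /** @return true if the given (lower-cased) n-gram matches a concept label in the knowledge base. */
    boolean containsConceptLabel(String label);
}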
@@ -0,0 +1,109 @@
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.util.FeatureUtil;
import org.dkpro.tc.features.ngram.meta.LuceneBasedMetaCollector;
import org.dkpro.tc.features.ngram.util.NGramUtils;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;

public class TheSozMetaCollector
extends LuceneBasedMetaCollector
{
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName";
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true)
protected String knowledgeBaseName;

protected KnowledgeBaseResource kbr;

public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
@ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = false)
private String ngramStopwordsFile;

private Set<String> stopwords;

@Override
public void initialize(UimaContext context) throws ResourceInitializationException
{
super.initialize(context);

try {
stopwords = FeatureUtil.getStopwords(ngramStopwordsFile, false);
}
catch (IOException e) {
throw new ResourceInitializationException(e);
}
try {
kbr = KnowledgeBaseFactory.getInstance().get(TheSozResource.NAME);
}
catch (ResourceLoaderException e) {
throw new ResourceInitializationException(e);
}
}

@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas)
throws TextClassificationException
{
FrequencyDistribution<String> frequencyDistribution = new FrequencyDistribution<>();

//TODO parameterize max ngram size
FrequencyDistribution<String> documentNgrams = getDocumentNgrams(jcas, true, false, 1, 4,
stopwords, Token.class);
for (String ngram : documentNgrams.getKeys()) {
// TODO language check
if (kbr.containsConceptLabel(ngram)) {
frequencyDistribution.inc(ngram);
}
}

return frequencyDistribution;
}

@Override
protected String getFieldName()
{
return "thesoz" + featureExtractorName;
}

public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
Set<String> stopwords, Class<? extends Annotation> annotationClass)
throws TextClassificationException
{
final String ngramGlue = " ";
FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
List<String> strings = NGramUtils.valuesToText(jcas, s, annotationClass.getName());
for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
if (lowerCaseNGrams) {
ngram = NGramUtils.lower(ngram);
}

if (NGramUtils.passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
String ngramString = StringUtils.join(ngram, ngramGlue);
documentNgrams.inc(ngramString);
}
}
}
return documentNgrams;
}
}
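To illustrate the flow with a made-up input: with lowerCaseNGrams = true, minN = 1 and maxN = 4, getDocumentNgrams turns a sentence such as "Income inequality rises" into the keys "income", "inequality", "rises", "income inequality", "inequality rises" and "income inequality rises" (joined with the single-space ngramGlue). getNgramsFD then keeps only the keys for which kbr.containsConceptLabel(...) returns true, and the top-K surviving keys from the Lucene index later become the binary features emitted by TheSozFeatures.extract.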
@@ -0,0 +1,120 @@
package eu.openminted.uc.socialsciences.variabledetection.resource;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.springframework.context.support.FileSystemXmlApplicationContext;

import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;

/**
* Copied from {@link ResourceFactory}
*/
public class KnowledgeBaseFactory
{
public static final String ENV_DKPRO_HOME = "DKPRO_HOME";
public final static String CONFIG_FILE = "resources.xml";

private static KnowledgeBaseFactory loader;

private FileSystemXmlApplicationContext context;

public static synchronized KnowledgeBaseFactory getInstance() throws ResourceLoaderException
{
if (loader == null) {
List<String> locs = new ArrayList<String>();
URL resourceXmlUrl = null;

// Check in workspace
try {
File f = new File(getWorkspace(), CONFIG_FILE);
if (f.isFile()) {
try {
resourceXmlUrl = f.toURI().toURL();
}
catch (MalformedURLException e) {
throw new ResourceLoaderException(e);
}
}
locs.add(f.getAbsolutePath());
}
catch (IOException e) {
locs.add("DKPro workspace not available");
}

// Check in classpath
if (resourceXmlUrl == null) {
resourceXmlUrl = ResourceFactory.class.getResource(CONFIG_FILE);
locs.add("Classpath: " + CONFIG_FILE);
}

// Check in default file system location
if (resourceXmlUrl == null && new File(CONFIG_FILE).isFile()) {
try {
resourceXmlUrl = new File(CONFIG_FILE).toURI().toURL();
}
catch (MalformedURLException e) {
throw new ResourceLoaderException(e);
}
locs.add(new File(CONFIG_FILE).getAbsolutePath());
}

// Bail out if still not found
if (resourceXmlUrl == null) {
throw new ResourceLoaderException("Unable to locate configuration file ["
+ CONFIG_FILE + "] in " + locs.toString());
}

loader = new KnowledgeBaseFactory(resourceXmlUrl.toString());
}
return loader;
}

/**
* Constructor parameterized by the path to the configuration file.
*
* @param location
* location of the configuration file.
*/
public KnowledgeBaseFactory(String location)
{
context = new FileSystemXmlApplicationContext(location);
}

/**
* @return All registered resources. ResourceLoaderExceptions are caught and ignored to allow for
* easy iteration over all resources runnable on the current system.
*/
public Collection<KnowledgeBaseResource> getAll()
{
return context.getBeansOfType(KnowledgeBaseResource.class).values();
}

/**
* Get the workspace directory.
*
* @return the workspace directory.
* @throws IOException
* if the workspace cannot be obtained
*/
private static File getWorkspace() throws IOException
{
if (System.getenv(ENV_DKPRO_HOME) != null) {
File f = new File(System.getenv(ENV_DKPRO_HOME));
return new File(f, ResourceFactory.class.getName());
}

throw new IOException("Environment variable [" + ENV_DKPRO_HOME + "] not set");
}

public KnowledgeBaseResource get(String name)
{
return (KnowledgeBaseResource) context.getBean(name, KnowledgeBaseResource.class);
}
}
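For reference, the lookup pattern used by the collector and feature extractor above, shown as a standalone usage sketch; it assumes a resources.xml (in the DKPRO_HOME workspace, on the classpath, or in the working directory) that defines a KnowledgeBaseResource bean whose name matches TheSozResource.NAME, and the example label is made up.

// Usage sketch mirroring TheSozMetaCollector.initialize() and TheSozFeatures.extract().
KnowledgeBaseResource kbr = KnowledgeBaseFactory.getInstance().get(TheSozResource.NAME);
if (kbr.containsConceptLabel("social inequality")) { // hypothetical example label
    // the n-gram matches a TheSoz concept label and is counted as a feature candidate
}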