Commit
#52 - Implement feature extractors from GESIS paper
- implemented TheSoz feature extractor
- added TheSoz feature extractor to TrainTestPipeline
maxxkia committed Sep 18, 2017
1 parent 9502973 commit d1d5d34
Showing 7 changed files with 452 additions and 2 deletions.
13 changes: 13 additions & 0 deletions ss-variable-detection/pom.xml
@@ -14,6 +14,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lsr.version>0.8.1</lsr.version>
<jena.version>3.4.0</jena.version>
</properties>

<dependencyManagement>
@@ -112,6 +113,18 @@
<version>2.5.0</version>
</dependency>

<!-- Jena -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-core</artifactId>
<version>${jena.version}</version>
</dependency>
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>jena-arq</artifactId>
<version>${jena.version}</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
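Side note: the new Jena dependencies are presumably consumed by the TheSozResource implementation (one of the changed files not rendered on this page) to load the TheSoz SKOS/RDF thesaurus and look up concept labels. A minimal sketch of such a lookup follows; the dump file name, the SPARQL shape, and the method name are illustrative assumptions, not the committed code.

// Hypothetical sketch only; file name, query shape and method name are assumptions.
import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.riot.RDFDataMgr;

public class TheSozLookupSketch
{
    public static boolean containsConceptLabel(Model model, String label)
    {
        ParameterizedSparqlString query = new ParameterizedSparqlString(
                "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
                + "ASK { ?concept skos:prefLabel|skos:altLabel ?l . "
                + "      FILTER (lcase(str(?l)) = lcase(str(?label))) }");
        query.setLiteral("label", label);
        try (QueryExecution exec = QueryExecutionFactory.create(query.asQuery(), model)) {
            return exec.execAsk();
        }
    }

    public static void main(String[] args)
    {
        // Load a (hypothetical) local TheSoz dump; jena-arq picks the parser from the extension.
        Model thesoz = RDFDataMgr.loadModel("thesoz.rdf");
        System.out.println(containsConceptLabel(thesoz, "social inequality"));
    }
}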
@@ -32,8 +32,10 @@
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import eu.openminted.uc.socialsciences.variabledetection.features.LuceneLemmaNGram;
import eu.openminted.uc.socialsciences.variabledetection.features.TheSozFeatures;
import eu.openminted.uc.socialsciences.variabledetection.features.WordnetFeatures;
import eu.openminted.uc.socialsciences.variabledetection.io.TextDatasetReader;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;
import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.bayes.NaiveBayes;
@@ -116,9 +118,11 @@ public static ParameterSpace getParameterSpace() throws ResourceInitializationEx
LuceneSkipNGram.PARAM_NGRAM_MIN_N, 2, LuceneSkipNGram.PARAM_NGRAM_MAX_N, 3),
TcFeatureFactory.create(NEFeatureExtractor.class),
TcFeatureFactory.create(WordnetFeatures.class, WordnetFeatures.PARAM_RESOURCE_NAME,
"wordnet", WordnetFeatures.PARAM_RESOURCE_LANGUAGE, "en",
WordnetFeatures.WORDNET_FIELD, WordnetFeatures.PARAM_RESOURCE_LANGUAGE, "en",
WordnetFeatures.PARAM_SYNONYM_FEATURE, true,
WordnetFeatures.PARAM_HYPERNYM_FEATURE, false)));
WordnetFeatures.PARAM_HYPERNYM_FEATURE, false),
TcFeatureFactory.create(TheSozFeatures.class, TheSozFeatures.PARAM_RESOURCE_NAME,
TheSozResource.NAME)));

// single-label feature selection (Weka specific options), reduces the feature set to 10
Map<String, Object> dimFeatureSelection = new HashMap<String, Object>();
@@ -0,0 +1,115 @@
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;

/**
* Extracts features using the TheSoz knowledge base.
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class TheSozFeatures
extends LuceneFeatureExtractorBase
implements FeatureExtractor
{
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName";
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true)
protected String knowledgeBaseName;

protected KnowledgeBaseResource kbr;

@Override
public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
throws ResourceInitializationException
{
if (!super.initialize(aSpecifier, aAdditionalParams)) {
return false;
}

try {
kbr = KnowledgeBaseFactory.getInstance().get(knowledgeBaseName);
}
catch (ResourceLoaderException e) {
throw new ResourceInitializationException(e);
}
return true;
}

@Override
public Set<Feature> extract(JCas view, TextClassificationTarget target)
throws TextClassificationException
{
FrequencyDistribution<String> featureVector = new FrequencyDistribution<>();
// TODO parameterize max ngram size
FrequencyDistribution<String> documentNgrams = TheSozMetaCollector.getDocumentNgrams(view,
true, false, 1, 4, stopwords, Token.class);
for (String ngram : documentNgrams.getKeys()) {
// TODO language check
if (kbr.containsConceptLabel(ngram)) {
featureVector.inc(ngram);
}
}

Set<Feature> features = new HashSet<Feature>();
for (String topNgram : topKSet.getKeys()) {
if (featureVector.getKeys().contains(topNgram)) {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 1));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true));
}
}
return features;
}

@Override
public List<MetaCollectorConfiguration> getMetaCollectorClasses(
Map<String, Object> parameterSettings)
throws ResourceInitializationException
{
return Arrays
.asList(new MetaCollectorConfiguration(TheSozMetaCollector.class, parameterSettings)
.addStorageMapping(TheSozMetaCollector.PARAM_TARGET_LOCATION,
TheSozFeatures.PARAM_SOURCE_LOCATION,
TheSozMetaCollector.LUCENE_DIR));
}

@Override
protected String getFieldName()
{
return TheSozResource.NAME + featureExtractorName;
}

@Override
protected int getTopN()
{
return ngramUseTopK;
}

@Override
protected String getFeaturePrefix()
{
return "TheSoz-";
}
}
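The KnowledgeBaseResource interface and the TheSozResource implementation used above appear to be part of this commit but are not rendered on this page. Inferred purely from the call sites (containsConceptLabel and TheSozResource.NAME), a hypothetical minimal shape of the interface would be:

// Hypothetical sketch inferred from the call sites in TheSozFeatures and TheSozMetaCollector;
// the actual interface in the commit may declare additional methods.
public interface KnowledgeBaseResource
{
    /** @return true if the given (lower-cased) n-gram matches a concept label in the knowledge base. */
    boolean containsConceptLabel(String label);
}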
@@ -0,0 +1,109 @@
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.util.FeatureUtil;
import org.dkpro.tc.features.ngram.meta.LuceneBasedMetaCollector;
import org.dkpro.tc.features.ngram.util.NGramUtils;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory;
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource;
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource;

public class TheSozMetaCollector
extends LuceneBasedMetaCollector
{
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName";
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true)
protected String knowledgeBaseName;

protected KnowledgeBaseResource kbr;

public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
@ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = false)
private String ngramStopwordsFile;

private Set<String> stopwords;

@Override
public void initialize(UimaContext context) throws ResourceInitializationException
{
super.initialize(context);

try {
stopwords = FeatureUtil.getStopwords(ngramStopwordsFile, false);
}
catch (IOException e) {
throw new ResourceInitializationException(e);
}
try {
kbr = KnowledgeBaseFactory.getInstance().get(TheSozResource.NAME);
}
catch (ResourceLoaderException e) {
throw new ResourceInitializationException(e);
}
}

@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas)
throws TextClassificationException
{
FrequencyDistribution<String> frequencyDistribution = new FrequencyDistribution<>();

//TODO parameterize max ngram size
FrequencyDistribution<String> documentNgrams = getDocumentNgrams(jcas, true, false, 1, 4,
stopwords, Token.class);
for (String ngram : documentNgrams.getKeys()) {
// TODO language check
if (kbr.containsConceptLabel(ngram)) {
frequencyDistribution.inc(ngram);
}
}

return frequencyDistribution;
}

@Override
protected String getFieldName()
{
return "thesoz" + featureExtractorName;
}

public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
Set<String> stopwords, Class<? extends Annotation> annotationClass)
throws TextClassificationException
{
final String ngramGlue = " ";
FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
List<String> strings = NGramUtils.valuesToText(jcas, s, annotationClass.getName());
for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
if (lowerCaseNGrams) {
ngram = NGramUtils.lower(ngram);
}

if (NGramUtils.passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
String ngramString = StringUtils.join(ngram, ngramGlue);
documentNgrams.inc(ngramString);
}
}
}
return documentNgrams;
}
}
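To illustrate the flow with a made-up input: with lowerCaseNGrams = true, minN = 1 and maxN = 4, getDocumentNgrams turns a sentence such as "Income inequality rises" into the keys "income", "inequality", "rises", "income inequality", "inequality rises" and "income inequality rises" (joined with the single-space ngramGlue). getNgramsFD then keeps only the keys for which kbr.containsConceptLabel(...) returns true, and the top-K surviving keys from the Lucene index later become the binary features emitted by TheSozFeatures.extract.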
@@ -0,0 +1,120 @@
package eu.openminted.uc.socialsciences.variabledetection.resource;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.springframework.context.support.FileSystemXmlApplicationContext;

import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;

/**
* Copied from {@link ResourceFactory}
*/
public class KnowledgeBaseFactory
{
public static final String ENV_DKPRO_HOME = "DKPRO_HOME";
public final static String CONFIG_FILE = "resources.xml";

private static KnowledgeBaseFactory loader;

private FileSystemXmlApplicationContext context;

public static synchronized KnowledgeBaseFactory getInstance() throws ResourceLoaderException
{
if (loader == null) {
List<String> locs = new ArrayList<String>();
URL resourceXmlUrl = null;

// Check in workspace
try {
File f = new File(getWorkspace(), CONFIG_FILE);
if (f.isFile()) {
try {
resourceXmlUrl = f.toURI().toURL();
}
catch (MalformedURLException e) {
throw new ResourceLoaderException(e);
}
}
locs.add(f.getAbsolutePath());
}
catch (IOException e) {
locs.add("DKPro workspace not available");
}

// Check in classpath
if (resourceXmlUrl == null) {
resourceXmlUrl = ResourceFactory.class.getResource(CONFIG_FILE);
locs.add("Classpath: " + CONFIG_FILE);
}

// Check in default file system location
if (resourceXmlUrl == null && new File(CONFIG_FILE).isFile()) {
try {
resourceXmlUrl = new File(CONFIG_FILE).toURI().toURL();
}
catch (MalformedURLException e) {
throw new ResourceLoaderException(e);
}
locs.add(new File(CONFIG_FILE).getAbsolutePath());
}

// Bail out if still not found
if (resourceXmlUrl == null) {
throw new ResourceLoaderException("Unable to locate configuration file ["
+ CONFIG_FILE + "] in " + locs.toString());
}

loader = new KnowledgeBaseFactory(resourceXmlUrl.toString());
}
return loader;
}

/**
* Constructor parameterized by the path to the configuration file.
*
* @param location
* location of the configuration file.
*/
public KnowledgeBaseFactory(String location)
{
context = new FileSystemXmlApplicationContext(location);
}

/**
* @return All registered resources. ResourceLoaderExceptions are caught and ignored to allow for
* easy iteration over all resources runnable on the current system.
*/
public Collection<KnowledgeBaseResource> getAll()
{
return context.getBeansOfType(KnowledgeBaseResource.class).values();
}

/**
* Get the workspace directory.
*
* @return the workspace directory.
* @throws IOException
* if the workspace cannot be obtained
*/
private static File getWorkspace() throws IOException
{
if (System.getenv(ENV_DKPRO_HOME) != null) {
File f = new File(System.getenv(ENV_DKPRO_HOME));
return new File(f, ResourceFactory.class.getName());
}

throw new IOException("Environment variable [" + ENV_DKPRO_HOME + "] not set");
}

public KnowledgeBaseResource get(String name)
{
return (KnowledgeBaseResource) context.getBean(name, KnowledgeBaseResource.class);
}
}
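For reference, the lookup pattern used by the collector and feature extractor above, shown as a standalone usage sketch; it assumes a resources.xml (in the DKPRO_HOME workspace, on the classpath, or in the working directory) that defines a KnowledgeBaseResource bean whose name matches TheSozResource.NAME, and the example label is made up.

// Usage sketch mirroring TheSozMetaCollector.initialize() and TheSozFeatures.extract().
KnowledgeBaseResource kbr = KnowledgeBaseFactory.getInstance().get(TheSozResource.NAME);
if (kbr.containsConceptLabel("social inequality")) { // hypothetical example label
    // the n-gram matches a TheSoz concept label and is counted as a feature candidate
}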