Commit
#52 - Implement feature extractors from GESIS paper
implemented wordnet feature extractor
maxxkia committed Sep 6, 2017
1 parent 75eac5a commit cf9ab3d
Showing 3 changed files with 202 additions and 23 deletions.
@@ -116,26 +116,27 @@ public static ParameterSpace getParameterSpace()
                 TcFeatureFactory.create(LuceneSkipNGram.class,
                         LuceneSkipNGram.PARAM_NGRAM_USE_TOP_K, 50, LuceneSkipNGram.PARAM_NGRAM_MIN_N, 2,
                         LuceneSkipNGram.PARAM_NGRAM_MAX_N, 3),
-                TcFeatureFactory.create(NEFeatureExtractor.class)/*,
+                TcFeatureFactory.create(NEFeatureExtractor.class),
                 TcFeatureFactory.create(WordnetFeatures.class,
                         WordnetFeatures.PARAM_RESOURCE_NAME, "wordnet",
                         WordnetFeatures.PARAM_RESOURCE_LANGUAGE, "en",
                         WordnetFeatures.PARAM_SYNONYM_FEATURE, true,
-                        WordnetFeatures.PARAM_HYPERNYM_FEATURE, true)*/));
+                        WordnetFeatures.PARAM_HYPERNYM_FEATURE, false)));

         // single-label feature selection (Weka specific options), reduces the feature set to 10
         Map<String, Object> dimFeatureSelection = new HashMap<String, Object>();
         dimFeatureSelection.put(DIM_FEATURE_SEARCHER_ARGS,
-                asList(new String[] { Ranker.class.getName(), "-N", "10" }));
+                asList(new String[] { Ranker.class.getName(), "-N", "100" }));
         dimFeatureSelection.put(DIM_ATTRIBUTE_EVALUATOR_ARGS,
                 asList(new String[] { InfoGainAttributeEval.class.getName() }));
         dimFeatureSelection.put(DIM_APPLY_FEATURE_SELECTION, true);

         ParameterSpace pSpace = new ParameterSpace(Dimension.createBundle("readers", dimReaders),
                 Dimension.create(DIM_LEARNING_MODE, LM_SINGLE_LABEL),
                 Dimension.create(DIM_FEATURE_MODE, FM_DOCUMENT), dimFeatureSets,
-                dimClassificationArgs,
-                Dimension.createBundle("featureSelection", dimFeatureSelection));
+                dimClassificationArgs
+                ,Dimension.createBundle("featureSelection", dimFeatureSelection)
+                );

         return pSpace;
     }
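For context, a ParameterSpace like the one above is consumed by a DKPro TC experiment and run through DKPro Lab. A minimal runner sketch, assuming DKPro TC 0.9-era APIs; the demo class, the Weka adapter choice, and the preprocessing components below are illustrative assumptions, not part of this commit:

    // Hypothetical runner sketch; assumes DKPro TC 0.9-era class names and a
    // static import of the experiment class's getParameterSpace() shown above.
    import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
    import org.dkpro.lab.Lab;
    import org.dkpro.tc.ml.ExperimentTrainTest;
    import org.dkpro.tc.weka.WekaClassificationAdapter;

    import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger;
    import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;

    public class WordnetFeatureDemo
    {
        public static void main(String[] args) throws Exception
        {
            // WordnetFeatures declares a @TypeCapability on Token and POS
            // annotations, so preprocessing must tokenize and POS-tag.
            AnalysisEngineDescription preprocessing = createEngineDescription(
                    createEngineDescription(BreakIteratorSegmenter.class),
                    createEngineDescription(OpenNlpPosTagger.class));

            ExperimentTrainTest experiment = new ExperimentTrainTest("wordnetFeatureDemo",
                    WekaClassificationAdapter.class);
            experiment.setParameterSpace(getParameterSpace()); // the method changed above
            experiment.setPreprocessing(preprocessing);
            Lab.getInstance().run(experiment);
        }
    }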
eu/openminted/uc/socialsciences/variabledetection/features/WordnetFeatures.java
@@ -1,9 +1,10 @@
 package eu.openminted.uc.socialsciences.variabledetection.features;

+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;

 import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.fit.descriptor.TypeCapability;
@@ -14,14 +15,16 @@
 import org.dkpro.tc.api.exception.TextClassificationException;
 import org.dkpro.tc.api.features.Feature;
 import org.dkpro.tc.api.features.FeatureExtractor;
-import org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase;
+import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
 import org.dkpro.tc.api.type.TextClassificationTarget;
+import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;

 import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity.PoS;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource;
+import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource.SemanticRelation;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.LexicalSemanticResourceException;
 import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;
@@ -30,9 +33,9 @@
  * Extracts features using wordnet i.e. entity id, synonyms, hypernyms
  */
 @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
-        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos"})
+        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos" })
 public class WordnetFeatures
-    extends FeatureExtractorResource_ImplBase
+    extends LuceneFeatureExtractorBase
     implements FeatureExtractor
 {
     public static final String PARAM_RESOURCE_NAME = "LsrResourceName";
@@ -57,6 +60,8 @@ public class WordnetFeatures
     @ConfigurationParameter(name = PARAM_DERIVATION_FEATURE, defaultValue = "false", mandatory = false)
     private boolean derivativeFeatures;

+    public static final String WORDNET_FIELD = "wordnet";
+
     @Override
     public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
         throws ResourceInitializationException
@@ -78,11 +83,10 @@ public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
     public Set<Feature> extract(JCas view, TextClassificationTarget target)
         throws TextClassificationException
     {
-        Set<Feature> featureList = new TreeSet<Feature>();
-
         FrequencyDistribution<String> featureVector = new FrequencyDistribution<>();
         List<Token> tokens = JCasUtil.selectCovered(view, Token.class, target);
         for (Token token : tokens) {
+            String lexeme = token.getCoveredText().toLowerCase();
             PoS pos = null;
             switch (token.getPos().getCoarseValue()) {
                 case "ADJ":
@@ -102,20 +106,30 @@ public Set<Feature> extract(JCas view, TextClassificationTarget target)
                 continue;

             try {
-                Set<Entity> foundEntities = lsr.getEntity(token.getCoveredText(), pos);
+                Set<Entity> foundEntities = lsr.getEntity(lexeme, pos);
                 for (Entity entity : foundEntities) {
                     featureVector.inc(entity.getId());

+                    //Synonyms
                     if (synonymFeatures) {
-                        Set<Entity> neighbors = lsr.getNeighbors(entity);
-                        for (Entity nEntity : neighbors) {
-                            featureVector.inc(nEntity.getId());
-                        }
+                        if (entity.getSense(lexeme) != null) {
+                            Set<String> synonyms = lsr.getRelatedLexemes(lexeme, pos,
+                                    entity.getSense(lexeme),
+                                    LexicalSemanticResource.LexicalRelation.synonymy);
+                            for (String synonym : synonyms) {
+                                Set<Entity> synonymEntities = lsr.getEntity(synonym, pos);
+                                for (Entity nEntity : synonymEntities) {
+                                    featureVector.inc(nEntity.getId());
+                                }
+                            }
+                        }
                     }

+                    //Hypernyms
                     if (hypernymFeatures) {
-                        Set<Entity> parents = lsr.getParents(entity);
-                        for (Entity pEntity : parents) {
+                        Set<Entity> hypernyms = lsr.getRelatedEntities(entity,
+                                SemanticRelation.hypernymy);
+                        for (Entity pEntity : hypernyms) {
                             featureVector.inc(pEntity.getId());
                         }
                     }
@@ -125,15 +139,45 @@ public Set<Feature> extract(JCas view, TextClassificationTarget target)
                 throw new IllegalStateException("Method not supported by LSR!", e);
             }
         }

-        for (String key : featureVector.getKeys()) {
-            featureList.add(new Feature(getFeaturePrefix() + key, 1));
+        Set<Feature> features = new HashSet<Feature>();
+        for (String topNgram : topKSet.getKeys()) {
+            if (featureVector.getKeys().contains(topNgram)) {
+                features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 1));
+            }
+            else {
+                features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true));
+            }
         }
-
-        return featureList;
+        return features;
     }

-    private String getFeaturePrefix()
+    @Override
+    public List<MetaCollectorConfiguration> getMetaCollectorClasses(
+            Map<String, Object> parameterSettings)
+        throws ResourceInitializationException
+    {
+        return Arrays.asList(
+                new MetaCollectorConfiguration(WordnetMetaCollector.class, parameterSettings)
+                        .addStorageMapping(WordnetMetaCollector.PARAM_TARGET_LOCATION,
+                                WordnetFeatures.PARAM_SOURCE_LOCATION,
+                                WordnetMetaCollector.LUCENE_DIR));
+    }
+
+    @Override
+    protected String getFieldName()
+    {
+        return WORDNET_FIELD + featureExtractorName;
+    }
+
+    @Override
+    protected int getTopN()
+    {
+        return ngramUseTopK;
+    }
+
+    @Override
+    protected String getFeaturePrefix()
     {
         return "wordnet-";
     }
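The rewritten extract() above no longer emits one feature per entity observed in the document: it iterates over topKSet, the K most frequent WordNet entity IDs recorded corpus-wide during the meta phase, and emits a binary feature for each, marking absent entities with a default zero. Every document therefore shares the same fixed feature space. A self-contained sketch of that encoding, with invented entity IDs (real IDs come from the Lucene index written by WordnetMetaCollector):

    // Sketch of the binary top-K encoding used above; pure JDK, invented IDs.
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class TopKEncodingSketch
    {
        public static void main(String[] args)
        {
            // Corpus-wide top-K entities (the role of topKSet).
            List<String> topK = Arrays.asList("dog#n#1", "animal#n#1", "cat#n#1");
            // Entities found in the current document (the role of featureVector).
            Set<String> observed = new HashSet<>(Arrays.asList("dog#n#1", "animal#n#1"));

            // Same shape as the loop above: every top-K entity becomes a feature,
            // and entities absent from the document get a default value of 0.
            for (String entityId : topK) {
                int value = observed.contains(entityId) ? 1 : 0;
                System.out.println("wordnet-_" + entityId + " = " + value);
            }
        }
    }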
eu/openminted/uc/socialsciences/variabledetection/features/WordnetMetaCollector.java (new file)
@@ -0,0 +1,134 @@
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.io.IOException;
import java.util.Collection;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.util.FeatureUtil;
import org.dkpro.tc.features.ngram.meta.LuceneBasedMetaCollector;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity;
import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource;
import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity.PoS;
import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource.SemanticRelation;
import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.LexicalSemanticResourceException;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException;

public class WordnetMetaCollector
    extends LuceneBasedMetaCollector
{
    public static final String PARAM_RESOURCE_NAME = "LsrResourceName";
    @ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true)
    protected String lsrResourceName;

    public static final String PARAM_RESOURCE_LANGUAGE = "LSRResourceLanguage";
    @ConfigurationParameter(name = PARAM_RESOURCE_LANGUAGE, mandatory = true)
    protected String lsrResourceLanguage;

    protected LexicalSemanticResource lsr;

    public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
    @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = false)
    private String ngramStopwordsFile;

    private Set<String> stopwords;

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException
    {
        super.initialize(context);

        try {
            stopwords = FeatureUtil.getStopwords(ngramStopwordsFile, false);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
        try {
            lsr = ResourceFactory.getInstance().get(lsrResourceName, lsrResourceLanguage);
        }
        catch (ResourceLoaderException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    protected FrequencyDistribution<String> getNgramsFD(JCas jcas)
        throws TextClassificationException
    {
        FrequencyDistribution<String> frequencyDistribution = new FrequencyDistribution<>();
        Collection<Token> tokens = JCasUtil.select(jcas, Token.class);
        for (Token token : tokens) {
            String lexeme = token.getCoveredText().toLowerCase();
            if (stopwords.contains(lexeme))
                continue;

            PoS pos = null;
            switch (token.getPos().getCoarseValue()) {
                case "ADJ":
                    pos = PoS.adj;
                    break;
                case "ADV":
                    pos = PoS.adv;
                    break;
                case "N":
                    pos = PoS.n;
                    break;
                case "V":
                    pos = PoS.v;
                    break;
            }
            if (pos == null)
                continue;

            try {
                Set<Entity> foundEntities = lsr.getEntity(lexeme, pos);
                for (Entity entity : foundEntities) {
                    frequencyDistribution.inc(entity.getId());

                    // Synonyms
                    if (entity.getSense(lexeme) != null) {
                        Set<String> synonyms = lsr.getRelatedLexemes(lexeme, pos,
                                entity.getSense(lexeme),
                                LexicalSemanticResource.LexicalRelation.synonymy);
                        for (String synonym : synonyms) {
                            Set<Entity> synonymEntities = lsr.getEntity(synonym, pos);
                            for (Entity nEntity : synonymEntities) {
                                frequencyDistribution.inc(nEntity.getId());
                            }
                        }
                    }

                    // Hypernyms
                    Set<Entity> hypernyms = lsr.getRelatedEntities(entity,
                            SemanticRelation.hypernymy);
                    for (Entity pEntity : hypernyms) {
                        frequencyDistribution.inc(pEntity.getId());
                    }
                }
            }
            catch (LexicalSemanticResourceException e) {
                throw new IllegalStateException("Method not supported by LSR!", e);
            }
        }

        return frequencyDistribution;
    }

    @Override
    protected String getFieldName()
    {
        return WordnetFeatures.WORDNET_FIELD + featureExtractorName;
    }

}
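The meta collector mirrors extract(): map the coarse POS tag to an LSR PoS, look up entities for the lowercased lexeme, then expand synonyms and hypernyms, counting entity IDs into a frequency distribution that LuceneBasedMetaCollector persists. Note that getFieldName() must return the same value in WordnetMetaCollector and WordnetFeatures (both build it from WordnetFeatures.WORDNET_FIELD plus featureExtractorName), so the extractor reads its top-K set from exactly the Lucene field the collector wrote. A standalone sketch of the LSR calls both classes rely on, assuming a "wordnet" resource is registered in the DKPro LSR ResourceFactory configuration (the same name/language pair the parameter space passes in; "dog" is an arbitrary example lexeme):

    // Sketch of the DKPro LSR lookups used above.
    import java.util.Set;

    import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity;
    import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity.PoS;
    import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource;
    import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource.SemanticRelation;
    import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory;

    public class LsrSketch
    {
        public static void main(String[] args) throws Exception
        {
            // Same resolution as initialize(): name "wordnet", language "en".
            LexicalSemanticResource lsr = ResourceFactory.getInstance().get("wordnet", "en");

            // All entities for the lexeme/POS pair; their IDs are the features.
            for (Entity entity : lsr.getEntity("dog", PoS.n)) {
                System.out.println("entity: " + entity.getId());

                // Hypernym expansion, exactly as in getNgramsFD()/extract().
                Set<Entity> hypernyms = lsr.getRelatedEntities(entity,
                        SemanticRelation.hypernymy);
                for (Entity hypernym : hypernyms) {
                    System.out.println("  hypernym: " + hypernym.getId());
                }
            }
        }
    }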
