-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#52 - Implement feature extractors from GESIS paper
- implemented TheSoz feature extractor - added TheSoz feature extractor to TrainTestPipeline
- Loading branch information
Showing
7 changed files
with
452 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
115 changes: 115 additions & 0 deletions
115
.../main/java/eu/openminted/uc/socialsciences/variabledetection/features/TheSozFeatures.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
package eu.openminted.uc.socialsciences.variabledetection.features; | ||
|
||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.fit.descriptor.TypeCapability; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
import org.apache.uima.resource.ResourceSpecifier; | ||
import org.dkpro.tc.api.exception.TextClassificationException; | ||
import org.dkpro.tc.api.features.Feature; | ||
import org.dkpro.tc.api.features.FeatureExtractor; | ||
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration; | ||
import org.dkpro.tc.api.type.TextClassificationTarget; | ||
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource; | ||
|
||
/** | ||
* Extracts features using TheSoz knowledge base | ||
*/ | ||
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) | ||
public class TheSozFeatures | ||
extends LuceneFeatureExtractorBase | ||
implements FeatureExtractor | ||
{ | ||
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName"; | ||
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true) | ||
protected String knowledgeBaseName; | ||
|
||
protected KnowledgeBaseResource kbr; | ||
|
||
@Override | ||
public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams) | ||
throws ResourceInitializationException | ||
{ | ||
if (!super.initialize(aSpecifier, aAdditionalParams)) { | ||
return false; | ||
} | ||
|
||
try { | ||
kbr = KnowledgeBaseFactory.getInstance().get(knowledgeBaseName); | ||
} | ||
catch (ResourceLoaderException e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
return true; | ||
} | ||
|
||
@Override | ||
public Set<Feature> extract(JCas view, TextClassificationTarget target) | ||
throws TextClassificationException | ||
{ | ||
FrequencyDistribution<String> featureVector = new FrequencyDistribution<>(); | ||
// TODO parameterize max ngram size | ||
FrequencyDistribution<String> documentNgrams = TheSozMetaCollector.getDocumentNgrams(view, | ||
true, false, 1, 4, stopwords, Token.class); | ||
for (String ngram : documentNgrams.getKeys()) { | ||
// TODO language check | ||
if (kbr.containsConceptLabel(ngram)) { | ||
featureVector.inc(ngram); | ||
} | ||
} | ||
|
||
Set<Feature> features = new HashSet<Feature>(); | ||
for (String topNgram : topKSet.getKeys()) { | ||
if (featureVector.getKeys().contains(topNgram)) { | ||
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 1)); | ||
} | ||
else { | ||
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true)); | ||
} | ||
} | ||
return features; | ||
} | ||
|
||
@Override | ||
public List<MetaCollectorConfiguration> getMetaCollectorClasses( | ||
Map<String, Object> parameterSettings) | ||
throws ResourceInitializationException | ||
{ | ||
return Arrays | ||
.asList(new MetaCollectorConfiguration(TheSozMetaCollector.class, parameterSettings) | ||
.addStorageMapping(TheSozMetaCollector.PARAM_TARGET_LOCATION, | ||
TheSozFeatures.PARAM_SOURCE_LOCATION, | ||
TheSozMetaCollector.LUCENE_DIR)); | ||
} | ||
|
||
@Override | ||
protected String getFieldName() | ||
{ | ||
return TheSozResource.NAME + featureExtractorName; | ||
} | ||
|
||
@Override | ||
protected int getTopN() | ||
{ | ||
return ngramUseTopK; | ||
} | ||
|
||
@Override | ||
protected String getFeaturePrefix() | ||
{ | ||
return "TheSoz-"; | ||
} | ||
} |
109 changes: 109 additions & 0 deletions
109
.../java/eu/openminted/uc/socialsciences/variabledetection/features/TheSozMetaCollector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
package eu.openminted.uc.socialsciences.variabledetection.features; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.apache.commons.lang.StringUtils; | ||
import org.apache.uima.UimaContext; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.fit.util.JCasUtil; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.jcas.tcas.Annotation; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
import org.dkpro.tc.api.exception.TextClassificationException; | ||
import org.dkpro.tc.api.features.util.FeatureUtil; | ||
import org.dkpro.tc.features.ngram.meta.LuceneBasedMetaCollector; | ||
import org.dkpro.tc.features.ngram.util.NGramUtils; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable; | ||
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseFactory; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.KnowledgeBaseResource; | ||
import eu.openminted.uc.socialsciences.variabledetection.resource.TheSozResource; | ||
|
||
public class TheSozMetaCollector | ||
extends LuceneBasedMetaCollector | ||
{ | ||
public static final String PARAM_RESOURCE_NAME = "knowledgeBaseName"; | ||
@ConfigurationParameter(name = PARAM_RESOURCE_NAME, mandatory = true) | ||
protected String knowledgeBaseName; | ||
|
||
protected KnowledgeBaseResource kbr; | ||
|
||
public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; | ||
@ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = false) | ||
private String ngramStopwordsFile; | ||
|
||
private Set<String> stopwords; | ||
|
||
@Override | ||
public void initialize(UimaContext context) throws ResourceInitializationException | ||
{ | ||
super.initialize(context); | ||
|
||
try { | ||
stopwords = FeatureUtil.getStopwords(ngramStopwordsFile, false); | ||
} | ||
catch (IOException e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
try { | ||
kbr = KnowledgeBaseFactory.getInstance().get(TheSozResource.NAME); | ||
} | ||
catch (ResourceLoaderException e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
} | ||
|
||
@Override | ||
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) | ||
throws TextClassificationException | ||
{ | ||
FrequencyDistribution<String> frequencyDistribution = new FrequencyDistribution<>(); | ||
|
||
//TODO parameterize max ngram size | ||
FrequencyDistribution<String> documentNgrams = getDocumentNgrams(jcas, true, false, 1, 4, | ||
stopwords, Token.class); | ||
for (String ngram : documentNgrams.getKeys()) { | ||
// TODO language check | ||
if (kbr.containsConceptLabel(ngram)) { | ||
frequencyDistribution.inc(ngram); | ||
} | ||
} | ||
|
||
return frequencyDistribution; | ||
} | ||
|
||
@Override | ||
protected String getFieldName() | ||
{ | ||
return "thesoz" + featureExtractorName; | ||
} | ||
|
||
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, | ||
boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, | ||
Set<String> stopwords, Class<? extends Annotation> annotationClass) | ||
throws TextClassificationException | ||
{ | ||
final String ngramGlue = " "; | ||
FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>(); | ||
for (Sentence s : JCasUtil.select(jcas, Sentence.class)) { | ||
List<String> strings = NGramUtils.valuesToText(jcas, s, annotationClass.getName()); | ||
for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) { | ||
if (lowerCaseNGrams) { | ||
ngram = NGramUtils.lower(ngram); | ||
} | ||
|
||
if (NGramUtils.passesNgramFilter(ngram, stopwords, filterPartialMatches)) { | ||
String ngramString = StringUtils.join(ngram, ngramGlue); | ||
documentNgrams.inc(ngramString); | ||
} | ||
} | ||
} | ||
return documentNgrams; | ||
} | ||
} |
120 changes: 120 additions & 0 deletions
120
...java/eu/openminted/uc/socialsciences/variabledetection/resource/KnowledgeBaseFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package eu.openminted.uc.socialsciences.variabledetection.resource; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.List; | ||
|
||
import org.springframework.context.support.FileSystemXmlApplicationContext; | ||
|
||
import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory; | ||
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; | ||
|
||
/** | ||
* Copied from {@link ResourceFactory} | ||
*/ | ||
public class KnowledgeBaseFactory | ||
{ | ||
public static final String ENV_DKPRO_HOME = "DKPRO_HOME"; | ||
public final static String CONFIG_FILE = "resources.xml"; | ||
|
||
private static KnowledgeBaseFactory loader; | ||
|
||
private FileSystemXmlApplicationContext context; | ||
|
||
public static synchronized KnowledgeBaseFactory getInstance() throws ResourceLoaderException | ||
{ | ||
if (loader == null) { | ||
List<String> locs = new ArrayList<String>(); | ||
URL resourceXmlUrl = null; | ||
|
||
// Check in workspace | ||
try { | ||
File f = new File(getWorkspace(), CONFIG_FILE); | ||
if (f.isFile()) { | ||
try { | ||
resourceXmlUrl = f.toURI().toURL(); | ||
} | ||
catch (MalformedURLException e) { | ||
throw new ResourceLoaderException(e); | ||
} | ||
} | ||
locs.add(f.getAbsolutePath()); | ||
} | ||
catch (IOException e) { | ||
locs.add("DKPro workspace not available"); | ||
} | ||
|
||
// Check in classpath | ||
if (resourceXmlUrl == null) { | ||
resourceXmlUrl = ResourceFactory.class.getResource(CONFIG_FILE); | ||
locs.add("Classpath: " + CONFIG_FILE); | ||
} | ||
|
||
// Check in default file system location | ||
if (resourceXmlUrl == null && new File(CONFIG_FILE).isFile()) { | ||
try { | ||
resourceXmlUrl = new File(CONFIG_FILE).toURI().toURL(); | ||
} | ||
catch (MalformedURLException e) { | ||
throw new ResourceLoaderException(e); | ||
} | ||
locs.add(new File(CONFIG_FILE).getAbsolutePath()); | ||
} | ||
|
||
// Bail out if still not found | ||
if (resourceXmlUrl == null) { | ||
throw new ResourceLoaderException("Unable to locate configuration file [" | ||
+ CONFIG_FILE + "] in " + locs.toString()); | ||
} | ||
|
||
loader = new KnowledgeBaseFactory(resourceXmlUrl.toString()); | ||
} | ||
return loader; | ||
} | ||
|
||
/** | ||
* Constructor parameterized by the path to the configuration file. | ||
* | ||
* @param location | ||
* location of the configuration file. | ||
*/ | ||
public KnowledgeBaseFactory(String location) | ||
{ | ||
context = new FileSystemXmlApplicationContext(location); | ||
} | ||
|
||
/** | ||
* @return All registered resources. ResourceLoaderExceptions are catched and ignored to all for | ||
* easy iteration over all resources runnalbe on the current system. | ||
*/ | ||
public Collection<KnowledgeBaseResource> getAll() | ||
{ | ||
return context.getBeansOfType(KnowledgeBaseResource.class).values(); | ||
} | ||
|
||
/** | ||
* Get the workspace directory. | ||
* | ||
* @return the workspace directory. | ||
* @throws IOException | ||
* if the workspace cannot be obtained | ||
*/ | ||
private static File getWorkspace() throws IOException | ||
{ | ||
if (System.getenv(ENV_DKPRO_HOME) != null) { | ||
File f = new File(System.getenv(ENV_DKPRO_HOME)); | ||
return new File(f, ResourceFactory.class.getName()); | ||
} | ||
|
||
throw new IOException("Environment variable [" + ENV_DKPRO_HOME + "] not set"); | ||
} | ||
|
||
public KnowledgeBaseResource get(String name) | ||
{ | ||
return (KnowledgeBaseResource) context.getBean(name, KnowledgeBaseResource.class); | ||
} | ||
} |
Oops, something went wrong.