Skip to content

Commit

Permalink
#52 - Implement feature extractors from GESIS paper
Browse files Browse the repository at this point in the history
- added lemma Ngram feature extractor
  • Loading branch information
maxxkia committed Sep 1, 2017
1 parent 69c186a commit 178422a
Showing 1 changed file with 71 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*******************************************************************************
* Copyright 2017
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package eu.openminted.uc.socialsciences.variabledetection.features;

import java.util.HashSet;
import java.util.Set;

import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.LuceneNGram;
import org.dkpro.tc.features.ngram.util.NGramUtils;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;

/**
 * Feature extractor that produces n-gram features over {@link Lemma} annotations within the
 * given text classification target.
 * <p>
 * For every key of {@code topKSet} exactly one feature is emitted, named
 * {@code <prefix>_<ngram>}. The configuration fields used here ({@code topKSet},
 * {@code ngramLowerCase}, {@code filterPartialStopwordMatches}, {@code ngramMinN},
 * {@code ngramMaxN}, {@code stopwords}) are not declared in this class; they come from the
 * superclass hierarchy.
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" })
public class LuceneLemmaNGram
    extends LuceneNGram
    implements FeatureExtractor
{

    /**
     * Builds the feature set for one classification target.
     *
     * @param jcas
     *            the CAS that is handed to the n-gram helper
     * @param target
     *            the text classification target whose n-grams are collected
     * @return one feature per key of {@code topKSet}: value {@code 1} when that n-gram is among
     *         the n-grams collected for the target, otherwise value {@code 0}
     * @throws TextClassificationException
     *             declared by the extractor contract
     */
    @Override
    public Set<Feature> extract(JCas jcas, TextClassificationTarget target)
        throws TextClassificationException
    {
        // NOTE(review): the helper is parameterised with Lemma.class; whether it reads the
        // lemma value or the covered text of each annotation is not visible here - confirm.
        FrequencyDistribution<String> targetNgrams = NGramUtils.getDocumentNgrams(jcas, target,
                ngramLowerCase, filterPartialStopwordMatches, ngramMinN, ngramMaxN, stopwords,
                Lemma.class);

        Set<Feature> result = new HashSet<Feature>();
        for (String candidate : topKSet.getKeys()) {
            String featureName = getFeaturePrefix() + "_" + candidate;

            // The three-argument form is used only for absent n-grams; the flag presumably
            // marks the value as the default one - verify against the Feature API.
            Feature feature = targetNgrams.getKeys().contains(candidate)
                    ? new Feature(featureName, 1)
                    : new Feature(featureName, 0, true);
            result.add(feature);
        }
        return result;
    }

    /**
     * @return the constant name prefix shared by all features of this extractor
     */
    @Override
    protected String getFeaturePrefix()
    {
        return "lemma-ngram";
    }
}

0 comments on commit 178422a

Please sign in to comment.