Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(model): Add linearsvc agent #102

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
- Running **wordFrequencySimilarity** agent

`atarashi -a wordFrequencySimilarity /path/to/file.c`
- Running **linearsvc** agent

`atarashi -a linearsvc /path/to/file.c`
- Running **tfidf** agent
- With **Cosine similarity**

Expand Down
90 changes: 90 additions & 0 deletions atarashi/agents/linearsvc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Copyright 2022 Sushant Kumar ([email protected])
SPDX-License-Identifier: GPL-2.0
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

__author__ = 'Sushant Kumar'
__email__ = '[email protected]'

import argparse

from atarashi.agents.atarashiAgent import AtarashiAgent
from atarashi.libs.initialmatch import spdx_identifer
from linearsvc import linearsvc


class Linearsvc(AtarashiAgent):
    '''
    Atarashi agent that reports a license short name using the linearsvc
    classifier, unless SPDX identifiers are found in the raw file first.
    '''

    def __init__(self, licenseList):
        super().__init__(licenseList)

    def predict_shortname(self, processed_comment):
        '''
        Classify a single preprocessed comment with the linearsvc model.

        :param processed_comment: extracted and preprocessed comment
        :return: Result of the predictor's ``predict`` call for the comment;
                 ``scan`` reads its first element as the license short name
        '''
        # The classifier and the predictor both receive the comment wrapped
        # in a one-element list.
        samples = [processed_comment]
        model = linearsvc(samples).classify()
        return model.predict(samples)

    def scan(self, filePath):
        '''
        Look for SPDX identifiers in the raw file content; when none are
        found, extract and preprocess the comments and predict the short name.

        :param filePath: Path of the file to scan
        :return: List of matches: either the SPDX identifier matches, or a
                 single dict with ``shortname``, ``sim_score``, ``sim_type``
                 and ``description`` for the predicted license
        '''
        with open(filePath) as source:
            content = source.read()

        identifiers = spdx_identifer(content, self.licenseList['shortname'])
        if identifiers:
            # SPDX matches take precedence; the classifier is not consulted.
            return list(identifiers)

        comment = super().loadFile(filePath)
        prediction = self.predict_shortname(comment)
        return [{
            'shortname': str(prediction[0]),
            'sim_score': 1.0,
            'sim_type': 'linearsvc',
            'description': '',
        }]


if __name__ == '__main__':
    # Stand-alone entry point: scan a single file with the linearsvc agent.
    # Positional arguments:
    #   processedLicenseList - processed license list file handed to the agent
    #   inputFile            - file to be scanned
    parser = argparse.ArgumentParser()
    parser.add_argument('processedLicenseList',
                        help='Specify the processed license list file')
    parser.add_argument('inputFile',
                        help='Specify the input file which needs to be scanned')

    args = parser.parse_args()

    licenseList = args.processedLicenseList
    filename = args.inputFile

    scanner = Linearsvc(licenseList)
    # scan() only returns the matches; print them so that running this module
    # as a script actually shows the result instead of silently dropping it.
    print(scanner.scan(filename))
5 changes: 4 additions & 1 deletion atarashi/atarashii.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.linearsvc import Linearsvc

__author__ = "Aman Jain"
__email__ = "[email protected]"
Expand Down Expand Up @@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "linearsvc":
scanner = Linearsvc(processedLicense)
elif agent_name == "tfidf":
scanner = TFIDF(processedLicense)
if similarity == "CosineSim":
Expand Down Expand Up @@ -128,7 +131,7 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
2 changes: 1 addition & 1 deletion atarashi/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def evaluate(scanner):
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ requires = [
"textdistance>=3.0.3",
"pyxDamerauLevenshtein>=1.5",
"nirjas>=0.0.5",
"urllib3>=1.24.1"
"urllib3>=1.24.1",
"linearsvc>=0.1.1"
]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
linearsvc>=0.1.1
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'linearsvc>=0.1.1'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
Expand Down