Skip to content

Commit

Permalink
Merge pull request fossology#2073 from siemens/feat/monkbulk/clean-text
Browse files Browse the repository at this point in the history
feat(monkbulk): New and custom delimiters

Reviewed-by: [email protected]
Tested-by: [email protected]
  • Loading branch information
ag4ums authored Oct 20, 2021
2 parents 357f068 + 9b474be commit 4d855cf
Show file tree
Hide file tree
Showing 18 changed files with 250 additions and 57 deletions.
4 changes: 2 additions & 2 deletions src/decider/agent/BulkReuser.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ public function rerunBulkAndDeciderOnUpload($uploadId, $groupId, $userId, $bulkI
*/
$deciderPlugin = plugin_find("agent_deciderjob");
$dependecies = array();
$sql = "INSERT INTO license_ref_bulk (user_fk,group_fk,rf_text,upload_fk,uploadtree_fk,ignore_irrelevant) "
. "SELECT $1 AS user_fk, $2 AS group_fk,rf_text,$3 AS upload_fk, $4 as uploadtree_fk, ignore_irrelevant
$sql = "INSERT INTO license_ref_bulk (user_fk,group_fk,rf_text,upload_fk,uploadtree_fk,ignore_irrelevant,bulk_delimiters) "
. "SELECT $1 AS user_fk, $2 AS group_fk,rf_text,$3 AS upload_fk, $4 as uploadtree_fk, ignore_irrelevant, bulk_delimiters
FROM license_ref_bulk WHERE lrb_pk=$5 RETURNING lrb_pk, $5 as lrb_origin";
$sqlLic = "INSERT INTO license_set_bulk (lrb_fk, rf_fk, removing, comment, reportinfo, acknowledgement) "
."SELECT $1 as lrb_fk, rf_fk, removing, comment, reportinfo, acknowledgement FROM license_set_bulk WHERE lrb_fk=$2";
Expand Down
16 changes: 12 additions & 4 deletions src/lib/php/Dao/LicenseDao.php
Original file line number Diff line number Diff line change
Expand Up @@ -551,16 +551,24 @@ public function getLicenseByShortName($licenseShortname, $groupId=null)
* @param bool[] $licenseRemovals
* @param string $refText
* @param bool $ignoreIrrelevant Ignore irrelevant files while scanning
* @param string $delimiters Delimiters for bulk scan,
* null or "DEFAULT" for default values
* @return int lrb_pk on success or -1 on fail
*/
public function insertBulkLicense($userId, $groupId, $uploadTreeId, $licenseRemovals, $refText, $ignoreIrrelevant=true)
public function insertBulkLicense($userId, $groupId, $uploadTreeId, $licenseRemovals, $refText, $ignoreIrrelevant=true, $delimiters=null)
{
if (strcasecmp($delimiters, "DEFAULT") === 0) {
$delimiters = null;
} elseif ($delimiters !== null) {
$delimiters = StringOperation::replaceUnicodeControlChar($delimiters);
}
$licenseRefBulkIdResult = $this->dbManager->getSingleRow(
"INSERT INTO license_ref_bulk (user_fk, group_fk, uploadtree_fk, rf_text, ignore_irrelevant)
VALUES ($1,$2,$3,$4,$5) RETURNING lrb_pk",
"INSERT INTO license_ref_bulk (user_fk, group_fk, uploadtree_fk, rf_text, ignore_irrelevant, bulk_delimiters)
VALUES ($1,$2,$3,$4,$5,$6) RETURNING lrb_pk",
array($userId, $groupId, $uploadTreeId,
StringOperation::replaceUnicodeControlChar($refText),
$this->dbManager->booleanToDb($ignoreIrrelevant)),
$this->dbManager->booleanToDb($ignoreIrrelevant),
$delimiters),
__METHOD__ . '.getLrb'
);
if ($licenseRefBulkIdResult === false) {
Expand Down
6 changes: 3 additions & 3 deletions src/monk/agent/match.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,15 @@ static char* getFileName(MonkState* state, long pFileId) {
return pFileName;
}

int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks) {
int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks, char* delimiters) {
File file;
file.id = pFileId;
int result = 0;

file.fileName = getFileName(state, pFileId);

if (file.fileName != NULL) {
result = readTokensFromFile(file.fileName, &(file.tokens), DELIMITERS);
result = readTokensFromFile(file.fileName, &(file.tokens), delimiters);

if (result) {
result = matchFileWithLicenses(state, &file, licenses, callbacks);
Expand Down
2 changes: 1 addition & 1 deletion src/monk/agent/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ size_t match_getEnd(const Match* match);

GArray* findAllMatchesBetween(const File* file, const Licenses* licenses, unsigned maxAllowedDiff, unsigned minAdjacentMatches, unsigned maxLeadingDiff);

int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks);
int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks, char* delimiters);
int matchFileWithLicenses(MonkState* state, const File* file, const Licenses* licenses, const MatchCallbacks* callbacks);

void findDiffMatches(const File* file, const License* license,
Expand Down
2 changes: 1 addition & 1 deletion src/monk/agent/monk.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define DIFF_TYPE_REMOVAL "M-"
#define DIFF_TYPE_REPLACE "MR"

#define DELIMITERS " \t\n\r\f#^%"
#define DELIMITERS " \t\n\r\f#^%,*"

#define MONK_CASE_INSENSITIVE
#define MAX_ALLOWED_DIFF_LENGTH 256
Expand Down
17 changes: 14 additions & 3 deletions src/monk/agent/monkbulk.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ int queryBulkArguments(MonkState* state, long bulkId) {
state->dbManager,
"queryBulkArguments",
"SELECT ut.upload_fk, ut.uploadtree_pk, lrb.user_fk, lrb.group_fk, "
"lrb.rf_text, lrb.ignore_irrelevant "
"lrb.rf_text, lrb.ignore_irrelevant, lrb.bulk_delimiters "
"FROM license_ref_bulk lrb INNER JOIN uploadtree ut "
"ON ut.uploadtree_pk = lrb.uploadtree_fk "
"WHERE lrb_pk = $1",
Expand All @@ -105,6 +105,15 @@ int queryBulkArguments(MonkState* state, long bulkId) {
bulkArguments->groupId = atoi(PQgetvalue(bulkArgumentsResult, 0, column++));
bulkArguments->refText = g_strdup(PQgetvalue(bulkArgumentsResult, 0, column++));
bulkArguments->ignoreIrre = strcmp(PQgetvalue(bulkArgumentsResult, 0, column++), "t") == 0;
if (PQgetisnull(bulkArgumentsResult, 0, column) == 1)
{
bulkArguments->delimiters = g_strdup(DELIMITERS);
column++;
}
else
{
bulkArguments->delimiters = normalize_escape_string(PQgetvalue(bulkArgumentsResult, 0, column++));
}
bulkArguments->bulkId = bulkId;
bulkArguments->actions = queryBulkActions(state, bulkId);
bulkArguments->jobId = fo_scheduler_jobId();
Expand Down Expand Up @@ -169,6 +178,7 @@ void bulkArguments_contents_free(BulkArguments* bulkArguments) {
free(bulkActions);

g_free(bulkArguments->refText);
g_free(bulkArguments->delimiters);

free(bulkArguments);
}
Expand All @@ -179,7 +189,7 @@ int bulk_identification(MonkState* state) {
License license = (License){
.refId = bulkArguments->licenseId,
};
license.tokens = tokenize(bulkArguments->refText, DELIMITERS);
license.tokens = tokenize(bulkArguments->refText, bulkArguments->delimiters);

GArray* licenseArray = g_array_new(FALSE, FALSE, sizeof (License));
g_array_append_val(licenseArray, license);
Expand Down Expand Up @@ -217,7 +227,8 @@ int bulk_identification(MonkState* state) {

long fileId = atol(PQgetvalue(filesResult, i, 0));

if (matchPFileWithLicenses(threadLocalState, fileId, licenses, &bulkCallbacks)) {
if (matchPFileWithLicenses(threadLocalState, fileId, licenses,
&bulkCallbacks, bulkArguments->delimiters)) {
fo_scheduler_heart(1);
} else {
fo_scheduler_heart(0);
Expand Down
1 change: 1 addition & 0 deletions src/monk/agent/monkbulk.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ typedef struct {
int groupId;
char* refText;
bool ignoreIrre;
char* delimiters;
BulkAction** actions;
} BulkArguments;

Expand Down
2 changes: 1 addition & 1 deletion src/monk/agent/scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ int processUploadId(MonkState* state, int uploadId, const Licenses* licenses) {
continue;
}

if (matchPFileWithLicenses(threadLocalState, pFileId, licenses, &schedulerCallbacks)) {
if (matchPFileWithLicenses(threadLocalState, pFileId, licenses, &schedulerCallbacks, DELIMITERS)) {
fo_scheduler_heart(1);
} else {
fo_scheduler_heart(0);
Expand Down
87 changes: 86 additions & 1 deletion src/monk/agent/string_operations.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#include "monk.h"

#define MAX_TOKENS_ARRAY_SIZE 4194304
#define MAX_DELIMIT_LEN 255

unsigned splittingDelim(char a, const char* delimiters) {
if (a == '\0')
Expand All @@ -43,9 +44,10 @@ unsigned splittingDelim(char a, const char* delimiters) {
}

unsigned specialDelim(const char* z){
char a, b;
char a, b, c;
a = *z;
b = *(z+1);
c = *(z+2);
if( a=='/') {
if (b=='/' || b=='*')
return 2;
Expand All @@ -58,6 +60,13 @@ unsigned specialDelim(const char* z){
else if( a==':' && b==':') {
return 2;
}
else if ((a==b && b==c) && (a=='"' || a=='\'')) {
return 3;
}
else if (a=='d' && b=='n' && c=='l') {
// dnl comments
return 3;
}
return 0;
}

Expand Down Expand Up @@ -221,3 +230,79 @@ size_t token_position_of(size_t index, const GArray* tokens) {

return result;
}

inline char* normalize_escape_string(char* input)
{
char* p = input;
char* q;
char ret[MAX_DELIMIT_LEN];
int i = 0;
bool flag = false;
bool space = false;
while (*p)
{
if (*p == ' ')
{
space = true;
}
if (*p == '\\')
{
q = p + 1;
if (*q == 'a')
{
ret[i] = '\a';
flag = true;
}
else if (*q == 'b')
{
ret[i] = '\b';
flag = true;
}
else if (*q == 'f')
{
ret[i] = '\f';
flag = true;
}
else if (*q == 'n')
{
ret[i] = '\n';
flag = true;
}
else if (*q == 'r')
{
ret[i] = '\r';
flag = true;
}
else if (*q == 't')
{
ret[i] = '\t';
flag = true;
}
else if (*q == 'v')
{
ret[i] = '\v';
flag = true;
}
else if (*q == '\\')
{
ret[i] = '\\';
flag = true;
}
if (flag == true)
{
flag = false;
p = q + 1;
i++;
continue;
}
}
ret[i++] = *p;
p++;
}
if (space != true)
{
ret[i++] = ' ';
}
ret[i] = '\0';
return g_strdup(ret);
}
2 changes: 2 additions & 0 deletions src/monk/agent/string_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,6 @@ int tokensEquals(const GArray* a, const GArray* b);

size_t token_position_of(size_t index, const GArray* tokens);

char* normalize_escape_string(char* input);

#endif // MONK_AGENT_STRING_OPERATIONS_H
6 changes: 3 additions & 3 deletions src/monk/agent_tests/Unit/test_license.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ void test_extractLicenses_One() {
CU_ASSERT_STRING_EQUAL(license.shortname, gpl3);

assertTokens(license.tokens,
"gnu", "general", "public", "license", "version", "3,", NULL);
"gnu", "general", "public", "license", "version", "3", NULL);

licenses_free(licenses);
PQclear(licensesResult);
Expand Down Expand Up @@ -232,9 +232,9 @@ void test_extractLicenses_Two() {
CU_ASSERT_STRING_EQUAL(license1.shortname, gpl2);

assertTokens(license0.tokens,
"gnu", "general", "public", "license", "version", "3,", NULL);
"gnu", "general", "public", "license", "version", "3", NULL);
assertTokens(license1.tokens,
"gnu", "general", "public", "license,", "version", "2", NULL);
"gnu", "general", "public", "license", "version", "2", NULL);

licenses_free(licenses);
}
Expand Down
7 changes: 5 additions & 2 deletions src/monk/agent_tests/Unit/test_string_operations.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ void test_tokenize() {
}

void test_tokenizeWithSpecialDelims() {
char* test = g_strdup("/*foo \n * bar \n *baz*/ ***booo \n:: qoo ");
char* test = g_strdup("/*foo \n * bar \n *baz*/ ***booo \n:: qoo \ndnl zit ");

GArray* token = tokenize(test, " \n");
CU_ASSERT_EQUAL(token->len, 5);
CU_ASSERT_EQUAL(token->len, 6);
CU_ASSERT_EQUAL(g_array_index(token, Token, 0).hashedContent, hash("foo"));
CU_ASSERT_EQUAL(g_array_index(token, Token, 0).length, 3);
CU_ASSERT_EQUAL(g_array_index(token, Token, 0).removedBefore, 2);
Expand All @@ -62,6 +62,9 @@ void test_tokenizeWithSpecialDelims() {
CU_ASSERT_EQUAL(g_array_index(token, Token, 4).hashedContent, hash("qoo"));
CU_ASSERT_EQUAL(g_array_index(token, Token, 4).length, 3);
CU_ASSERT_EQUAL(g_array_index(token, Token, 4).removedBefore, 5);
CU_ASSERT_EQUAL(g_array_index(token, Token, 5).hashedContent, hash("zit"));
CU_ASSERT_EQUAL(g_array_index(token, Token, 5).length, 3);
CU_ASSERT_EQUAL(g_array_index(token, Token, 5).removedBefore, 6);
g_array_free(token, TRUE);
g_free(test);
}
Expand Down
6 changes: 4 additions & 2 deletions src/www/ui/change-license-bulk.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ protected function handle(Request $request)
{
$uploadTreeId = intval($request->get('uploadTreeId'));
if ($uploadTreeId <= 0) {
return new JsonResponse(array("error" => 'bad request'), JsonResponse::HTTP_INTERNAL_SERVER_ERROR);
return new JsonResponse(array("error" => 'bad request'), JsonResponse::HTTP_BAD_REQUEST);
}

try {
Expand Down Expand Up @@ -109,13 +109,15 @@ private function getJobQueueId($uploadTreeId, Request $request)
$refText = $request->get('refText');
$actions = $request->get('bulkAction');
$ignoreIrrelevantFiles = (intval($request->get('ignoreIrre')) == 1);
$delimiters = $request->get('delimiters');

$licenseRemovals = array();
foreach ($actions as $licenseAction) {
$licenseRemovals[$licenseAction['licenseId']] = array(($licenseAction['action']=='Remove'), $licenseAction['comment'], $licenseAction['reportinfo'], $licenseAction['acknowledgement']);
}
$bulkId = $this->licenseDao->insertBulkLicense($userId, $groupId,
$uploadTreeId, $licenseRemovals, $refText, $ignoreIrrelevantFiles);
$uploadTreeId, $licenseRemovals, $refText, $ignoreIrrelevantFiles,
$delimiters);

if ($bulkId <= 0) {
throw new Exception('cannot insert bulk reference');
Expand Down
16 changes: 11 additions & 5 deletions src/www/ui/core-schema.dat
Original file line number Diff line number Diff line change
Expand Up @@ -1135,10 +1135,7 @@
$Schema["TABLE"]["license_candidate"]["group_fk"]["ADD"] = "ALTER TABLE \"license_candidate\" ADD COLUMN \"group_fk\" int8";
$Schema["TABLE"]["license_candidate"]["group_fk"]["ALTER"] = "ALTER TABLE \"license_candidate\" ALTER COLUMN \"group_fk\" DROP NOT NULL";

$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"lrb_pk\" IS 'Primary Key'";
$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"lrb_pk\" int8 DEFAULT nextval('license_ref_bulk_lrb_pk_seq'::regclass)";
$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"lrb_pk\" SET NOT NULL, ALTER COLUMN \"lrb_pk\" SET DEFAULT nextval('license_ref_bulk_lrb_pk_seq'::regclass)";


$Schema["TABLE"]["license_candidate"]["rf_creationdate"]["DESC"] = "COMMENT ON COLUMN \"license_candidate\".\"rf_creationdate\" IS 'License creation date'";
$Schema["TABLE"]["license_candidate"]["rf_creationdate"]["ADD"] = "ALTER TABLE \"license_candidate\" ADD COLUMN \"rf_creationdate\" timestamptz DEFAULT now()";
$Schema["TABLE"]["license_candidate"]["rf_creationdate"]["ALTER"] = "ALTER TABLE \"license_candidate\" ALTER COLUMN \"rf_creationdate\" DROP NOT NULL, ALTER COLUMN \"rf_creationdate\" SET DEFAULT now()";
Expand All @@ -1155,6 +1152,11 @@
$Schema["TABLE"]["license_candidate"]["rf_user_fk_modified"]["ADD"] = "ALTER TABLE \"license_candidate\" ADD COLUMN \"rf_user_fk_modified\" int4";
$Schema["TABLE"]["license_candidate"]["rf_user_fk_modified"]["ALTER"] = "ALTER TABLE \"license_candidate\" ALTER COLUMN \"rf_user_fk_modified\" DROP NOT NULL";


$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"lrb_pk\" IS 'Primary Key'";
$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"lrb_pk\" int8 DEFAULT nextval('license_ref_bulk_lrb_pk_seq'::regclass)";
$Schema["TABLE"]["license_ref_bulk"]["lrb_pk"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"lrb_pk\" SET NOT NULL, ALTER COLUMN \"lrb_pk\" SET DEFAULT nextval('license_ref_bulk_lrb_pk_seq'::regclass)";

$Schema["TABLE"]["license_ref_bulk"]["user_fk"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"user_fk\" IS 'user who made this bulk scan'";
$Schema["TABLE"]["license_ref_bulk"]["user_fk"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"user_fk\" int8";
$Schema["TABLE"]["license_ref_bulk"]["user_fk"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"user_fk\" SET NOT NULL";
Expand All @@ -1163,7 +1165,7 @@
$Schema["TABLE"]["license_ref_bulk"]["group_fk"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"group_fk\" int8";
$Schema["TABLE"]["license_ref_bulk"]["group_fk"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"group_fk\" DROP NOT NULL";

$Schema["TABLE"]["license_ref_bulk"]["rf_text"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"rf_text\" IS 'text searched by nulk scan'";
$Schema["TABLE"]["license_ref_bulk"]["rf_text"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"rf_text\" IS 'text searched by bulk scan'";
$Schema["TABLE"]["license_ref_bulk"]["rf_text"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"rf_text\" text";
$Schema["TABLE"]["license_ref_bulk"]["rf_text"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"rf_text\" SET NOT NULL";

Expand All @@ -1179,6 +1181,10 @@
$Schema["TABLE"]["license_ref_bulk"]["ignore_irrelevant"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"ignore_irrelevant\" bool DEFAULT true";
$Schema["TABLE"]["license_ref_bulk"]["ignore_irrelevant"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"ignore_irrelevant\" SET NOT NULL, ALTER COLUMN \"ignore_irrelevant\" SET DEFAULT true";

/* bulk_delimiters is a nullable text column; NULL is the default in both the
 * ADD and the ALTER statement so the two definitions agree. */
$Schema["TABLE"]["license_ref_bulk"]["bulk_delimiters"]["DESC"] = "COMMENT ON COLUMN \"license_ref_bulk\".\"bulk_delimiters\" IS 'What delimiters to use for scan?'";
$Schema["TABLE"]["license_ref_bulk"]["bulk_delimiters"]["ADD"] = "ALTER TABLE \"license_ref_bulk\" ADD COLUMN \"bulk_delimiters\" text DEFAULT NULL";
$Schema["TABLE"]["license_ref_bulk"]["bulk_delimiters"]["ALTER"] = "ALTER TABLE \"license_ref_bulk\" ALTER COLUMN \"bulk_delimiters\" SET DEFAULT NULL";


$Schema["TABLE"]["license_set_bulk"]["rf_fk"]["DESC"] = "COMMENT ON COLUMN \"license_set_bulk\".\"rf_fk\" IS 'reference to license_ref* (not only license_ref)'";
$Schema["TABLE"]["license_set_bulk"]["rf_fk"]["ADD"] = "ALTER TABLE \"license_set_bulk\" ADD COLUMN \"rf_fk\" int8";
Expand Down
Loading

0 comments on commit 4d855cf

Please sign in to comment.