forked from datagrok-ai/public
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequence-space.ts
62 lines (58 loc) · 2.62 KB
/
sequence-space.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import * as ui from 'datagrok-api/ui';
import * as DG from 'datagrok-api/dg';
import * as grok from 'datagrok-api/grok';
import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
import {mmDistanceFunctionArgs} from '@datagrok-libraries/ml/src/macromolecule-distance-functions/types';
import {SeqHandler} from '@datagrok-libraries/bio/src/utils/seq-handler';
import {getMonomerSubstitutionMatrix} from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
/**
 * Result shape for a sequence-space computation.
 * NOTE(review): the producer of this object is not visible in this file; the
 * member descriptions below are inferred from names and types — confirm against it.
 */
export interface ISequenceSpaceResult {
  /** Columns holding the computed coordinates (presumably the embedding axes — TODO confirm). */
  coordinates: DG.ColumnList;
  /** Optional distance values as a flat `Float32Array` (layout not established here — TODO confirm). */
  distance?: Float32Array;
}
/**
 * Encodes each sequence of `seqCol` as a string in which every distinct canonical
 * monomer is replaced by a single UTF-16 code unit. Encoding per monomer (rather than
 * per character) lets multi-character monomers and non-FASTA notations be compared
 * as plain strings.
 *
 * Symbols are handed out in order of first appearance, starting at char code 1
 * (0 is reserved for null).
 *
 * @param seqCol Column with the sequences to encode.
 * @param similarityMetric Metric the encoded sequences will be compared with. Only
 *   `MONOMER_CHEMICAL_DISTANCE` and `NEEDLEMANN_WUNSCH` trigger the substitution-matrix lookup.
 * @param fingerprintType Fingerprint type forwarded to `getMonomerSubstitutionMatrix`.
 * @returns `seqList` — one encoded string per row (rows with a null category or a
 *   missing value hold `null`); `options` — empty object, or `{scoringMatrix, alphabetIndexes}`
 *   with `alphabetIndexes` keyed by the encoded symbols for the two matrix-based metrics.
 */
export async function getEncodedSeqSpaceCol(
  seqCol: DG.Column, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames, fingerprintType: string = 'Morgan'
): Promise<{ seqList: string[], options: { [_: string]: any } }> {
  const total = seqCol.length;
  const handler = SeqHandler.forColumn(seqCol);
  const encoded = Array<string>(total);
  // canonical monomer -> single-character symbol used in the encoded strings
  const monomerToSymbol = new Map<string, string>();
  let nextCode = 1; // char code 0 is reserved for null
  const categories = seqCol.categories;
  const rawData = seqCol.getRawData();

  for (let row = 0; row < total; row++) {
    const isMissing = categories[rawData[row]] === null || seqCol.isNone(row);
    if (isMissing) {
      // missing rows are stored as null even though the array is typed as string[]
      //@ts-ignore
      encoded[row] = null;
      continue;
    }

    const parts = handler.getSplitted(row);
    let packed = '';
    for (let pos = 0; pos < parts.length; pos++) {
      const monomer = parts.getCanonical(pos);
      let symbol = monomerToSymbol.get(monomer);
      if (symbol === undefined) {
        // first time this monomer is seen: assign it the next free char code
        symbol = String.fromCharCode(nextCode);
        monomerToSymbol.set(monomer, symbol);
        nextCode++;
      }
      packed += symbol;
    }
    encoded[row] = packed;
  }

  const needsScoringMatrix =
    similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE ||
    similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH;
  if (!needsScoringMatrix)
    return {seqList: encoded, options: {} as mmDistanceFunctionArgs};

  const substitution = await getMonomerSubstitutionMatrix(Array.from(monomerToSymbol.keys()), fingerprintType);
  // re-key the matrix indexes from monomer names to the encoded symbols,
  // so the distance function can look up characters of the encoded strings directly
  const symbolToMatrixIdx: { [_: string]: number } = {};
  for (const [monomer, matrixIdx] of Object.entries(substitution.alphabetIndexes))
    symbolToMatrixIdx[monomerToSymbol.get(monomer)!] = matrixIdx;

  const options: mmDistanceFunctionArgs = {
    scoringMatrix: substitution.scoringMatrix,
    alphabetIndexes: symbolToMatrixIdx,
  };
  return {seqList: encoded, options};
}