vox-core
Version:
Runtime de aplicaciones multiplataforma
421 lines (399 loc) • 15.7 kB
JavaScript
/** Burrows-Wheeler transform, computed with the Induced Sorting Suffix Array
* construction mechanism (sais). Code is a port of:
* https://sites.google.com/site/yuta256/sais
* which is:
* Copyright (c) 2008-2010 Yuta Mori All Rights Reserved.
* and licensed under an MIT/X11 license. I generally looked at both
* the C and the Java implementations to guide my work.
*
* This JavaScript port is:
* Copyright (c) 2013 C. Scott Ananian
* and licensed under GPLv2; see the README at the top level of this package.
*/
if (typeof define !== 'function') { var define = require('amdefine')(module); }
define(['./freeze', './Util'], function(freeze, Util) {
// Shorthand for console.assert, pre-bound to console so that it can be
// invoked as a plain function throughout this module.
var ASSERT = console.assert.bind(console);
// we're dispensing with the "arbitrary alphabet" stuff of the source
// and just using Uint8Arrays.
/** Count how often each symbol occurs in T[0..n-1].
 * The first k entries of C are reset to zero, then C[s] is incremented
 * once for every position of T (below n) that holds the symbol s. */
var getCounts = function(T, C, n, k) {
  var sym, pos;
  // Clear the k counters (the order of clearing does not matter).
  for (sym = k - 1; sym >= 0; sym -= 1) {
    C[sym] = 0;
  }
  // Tally one occurrence per text position.
  for (pos = 0; pos < n; pos += 1) {
    C[T[pos]] += 1;
  }
};
/** Turn the symbol counts C[0..k-1] into bucket boundaries B[0..k-1].
 * When `end` is truthy, B[s] receives the (exclusive) end offset of the
 * bucket for symbol s; otherwise it receives the bucket's start offset.
 * C and B may be the same array: each count is read before its slot is
 * overwritten. */
var getBuckets = function(C, B, k, end) {
  var sym, count, start = 0;
  for (sym = 0; sym < k; sym++) {
    count = C[sym];
    B[sym] = end ? (start + count) : start;
    start += count;
  }
};
/** Sort all type LMS suffixes.
 *
 * Works in place on SA using two induction passes over T[0..n-1]: a
 * left-to-right pass that fills buckets from their starts, followed by a
 * right-to-left pass that fills buckets from their ends.
 *
 * T - the text; SA - work array of (at least) n signed integers;
 * C - symbol counts; B - bucket cursors; n - text length; k - alphabet size.
 * C and B may be the same array, in which case the counts are recomputed
 * before each pass (getBuckets overwrites them).
 *
 * Entries are stored bit-complemented (~x, hence negative) as a marker; see
 * the individual stores below for the exact condition.
 * Within this file it is only called from SA_IS, and only when m > 1. */
var LMSsort = function(T, SA, C, B, n, k) {
var b, i, j;
var c0, c1;
/* compute SAl */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, false); /* find starts of buckets */
// Seed the pass: put position n-2 at the start of the bucket of T[n-1],
// complemented when T[n-2] is smaller than T[n-1].
j = n - 1;
b = B[c1 = T[j]];
j--;
SA[b++] = (T[j] < c1) ? ~j : j;
for (i = 0; i < n; i++) {
if ((j = SA[i]) > 0) {
ASSERT(T[j] >= T[j+1]);
// b is the write cursor of the current bucket c1; when the bucket
// changes, save the cursor back to B and load the new bucket's cursor.
if ((c0 = T[j]) !== c1) { B[c1] = b; b = B[c1 = c0]; }
ASSERT(i < b);
j--;
// Store the preceding position, complemented when its character is
// smaller than the bucket character; then clear the slot just consumed.
SA[b++] = (T[j] < c1) ? ~j : j;
SA[i] = 0;
} else if (j < 0) {
// Complemented entry: restore it to its non-negative value.
SA[i] = ~j;
}
}
/* compute SAs */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = n-1, b = B[c1 = 0]; i >= 0; i--) {
if ((j = SA[i]) > 0) {
ASSERT(T[j] <= T[j+1]);
if ((c0 = T[j]) !== c1) { B[c1] = b; b = B[c1 = c0]; }
ASSERT(b <= i);
j--;
// If the preceding character is larger than the bucket character, store
// the complement of the original index (j+1 after the decrement above);
// otherwise store the preceding position. Then clear the consumed slot.
SA[--b] = (T[j] > c1) ? ~(j+1) : j;
SA[i] = 0;
}
}
};
/** Post-process the result of LMSsort.
 *
 * Compacts the m sorted substrings (marked by negative, i.e. complemented,
 * entries) into SA[0..m-1], records each substring's length, and then
 * assigns a name to every substring, giving equal names to substrings that
 * compare equal to their predecessor in sorted order.
 *
 * T - the text; SA - work array; n - text length; m - number of substrings.
 * Returns the number of distinct names assigned. Names (and, before them,
 * lengths) are stored at SA[m + (p >>> 1)] for a substring starting at p. */
var LMSpostproc = function(T, SA, n, m) {
var i, j, p, q, plen, qlen, name;
var c0, c1;
var diff;
/* compact all the sorted substrings into the first m items of SA
* 2*m must not be larger than n (provable) */
ASSERT(n > 0);
// Leading run of negative entries: un-complement them where they are.
for (i = 0; (p = SA[i]) < 0; i++) { SA[i] = ~p; ASSERT((i+1) < n); }
if (i < m) {
// Move the remaining negative entries down to fill SA[j..m-1], zeroing
// the slots they came from; stop once m entries have been collected.
for (j = i, i++; ; i++) {
ASSERT(i < n);
if ((p = SA[i]) < 0) {
SA[j++] = ~p; SA[i] = 0;
if (j === m) { break; }
}
}
}
/* store the length of all substrings */
// Scan T from right to left; j tracks the end of the current substring and
// i + 1 its start, so j - i is the value recorded for that start position.
c0 = T[i = j = n - 1];
do { c1 = c0; } while ( ((--i) >= 0 ) && ((c0=T[i]) >= c1) );
for (; i >= 0; ) {
do { c1 = c0; } while ( ((--i) >= 0 ) && ((c0=T[i]) <= c1) );
if (i >= 0) {
SA[m + ((i + 1) >>> 1)] = j - i; j = i + 1;
do { c1 = c0; } while ( ((--i) >= 0 ) && ((c0=T[i]) >= c1) );
}
}
/* find the lexicographic names of all substrings */
// q / qlen describe the previously visited substring; q starts at n so the
// first substring is always treated as different.
for (i = 0, name = 0, q = n, qlen = 0; i < m; i++) {
p = SA[i]; plen = SA[m + (p >>> 1)]; diff = true;
if ((plen === qlen) && ((q + plen) < n)) {
for (j = 0; (j < plen) && (T[p + j] === T[q + j]); ) { j++; }
if (j === plen) { diff = false; }
}
if (diff) { name++; q = p; qlen = plen; }
// The name overwrites the length stored in the same slot.
SA[m + (p >>> 1)] = name;
}
return name;
};
/* compute SA and BWT */
/** Induce the complete suffix array in SA from the entries already placed
 * in it (SA_IS calls this in stage 3 when isbwt is false).
 *
 * Same two-pass structure as LMSsort: a left-to-right pass writing at
 * bucket starts, then a right-to-left pass writing at bucket ends.
 * Complemented (negative) values are used as temporary markers.
 *
 * T - the text; SA - work/output array; C - symbol counts; B - bucket
 * cursors (may be the same array as C, in which case counts are recomputed
 * before each pass); n - text length; k - alphabet size. */
var induceSA = function(T, SA, C, B, n, k) {
var b, i, j;
var c0, c1;
/* compute SAl */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, false); /* find starts of buckets */
// Seed with the last position n-1, complemented when T[n-2] < T[n-1].
j = n - 1;
b = B[c1 = T[j]];
SA[b++] = ((j > 0) && (T[j-1] < c1)) ? ~j : j;
for (i = 0; i < n; i++) {
// Every visited entry is complemented in place; only entries that were
// positive before the flip induce a new entry.
j = SA[i]; SA[i] = ~j;
if (j > 0) {
j--;
ASSERT( T[j] >= T[j + 1] );
// Switch the write cursor b to the bucket of T[j] if it changed.
if ((c0 = T[j]) !== c1) { B[c1] = b; b = B[c1=c0]; }
ASSERT( i < b );
SA[b++] = ((j > 0) && (T[j-1] < c1)) ? ~j : j;
}
}
/* compute SAs */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, true); /* find ends of buckets */
for (i = n-1, b = B[c1 = 0]; i >= 0; i--) {
if ((j = SA[i]) > 0) {
j--;
ASSERT( T[j] <= T[j + 1] );
if ((c0 = T[j]) !== c1) { B[c1] = b; b = B[c1 = c0]; }
ASSERT( b <= i );
// Complement the stored position when it is 0 or when the character
// before it is larger than the bucket character.
SA[--b] = ((j === 0) || (T[j - 1] > c1)) ? ~j : j;
} else {
// Non-positive entry: undo the complement.
SA[i] = ~j;
}
}
};
/** Variant of induceSA used by SA_IS when isbwt is true.
 *
 * Follows the same two passes, but as entries are consumed they are
 * overwritten with the character preceding the corresponding position
 * (complemented in the first pass, plain in the second) rather than kept
 * as positions.
 *
 * T - the text; SA - work/output array; C - symbol counts; B - bucket
 * cursors (may be the same array as C); n - text length; k - alphabet size.
 * Returns pidx: the index at which the right-to-left pass found an entry
 * equal to 0, or -1 if it found none. */
var computeBWT = function(T, SA, C, B, n, k) {
var b, i, j, pidx = -1;
var c0, c1;
/* compute SAl */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, false); /* find starts of buckets */
// Seed with the last position n-1, complemented when T[n-2] < T[n-1].
j = n - 1;
b = B[c1 = T[j]];
SA[b++] = ((j > 0) && (T[j - 1] < c1)) ? ~j : j;
for (i = 0; i < n; i++) {
if ((j=SA[i]) > 0) {
j--;
ASSERT( T[j] >= T[j+1] );
// Replace the consumed entry with the complement of the preceding
// character.
SA[i] = ~(c0 = T[j]);
if (c0 !== c1) { B[c1] = b; b = B[c1 = c0]; }
ASSERT( i < b );
SA[b++] = ((j > 0) && (T[j - 1] < c1)) ? ~j : j;
} else if (j !== 0) {
// Negative entry: undo the complement.
SA[i] = ~j;
}
}
/* compute SAs */
if (C === B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, true); /* find ends of buckets */
for (i = n-1, b = B[c1 = 0]; i >= 0; i--) {
if ((j = SA[i]) > 0) {
j--;
ASSERT( T[j] <= T[j+1] );
// Replace the consumed entry with the preceding character itself.
SA[i] = c0 = T[j];
if (c0 !== c1) { B[c1] = b; b = B[c1 = c0]; }
ASSERT( b <= i );
// Either the complement of the character before position j (when that
// character is larger than the bucket character) or the position j.
SA[--b] = ((j > 0) && (T[j-1] > c1)) ? (~T[j-1]) : j;
} else if (j !== 0) {
// Negative entry: undo the complement.
SA[i] = ~j;
} else {
// Entry is exactly 0: remember where it sits.
pidx = i;
}
}
return pidx;
};
/* Find the suffix array SA of T[0..n-1] in {0..k-1}^n.
 * Uses a working space (excluding T and SA) of at most 2n+O(1) for a
 * constant alphabet.
 *
 * T     - the text (indexable by 0..n-1, symbols in 0..k-1)
 * SA    - signed integer work/output array; the bucket arrays may be carved
 *         out of it at offsets up to n + fs, so it is presumably expected to
 *         hold at least n + fs entries (TODO confirm against callers)
 * fs    - amount of spare space in SA beyond the first n entries
 * n     - length of the text
 * k     - alphabet size
 * isbwt - when truthy, finish with computeBWT instead of induceSA
 *
 * Returns pidx: the value returned by computeBWT when isbwt is truthy,
 * otherwise 0.
 *
 * `flags` records how C and B were obtained:
 *   1 - set on the k <= 256 path, where C is a freshly allocated buffer
 *   2 - B is a freshly allocated buffer; it is dropped before the recursive
 *       call and allocated again afterwards
 *   4 - C and B are one shared freshly allocated buffer; dropped before the
 *       recursive call and allocated again afterwards
 *   8 - the counts in C must be recomputed at the start of stage 3 */
var SA_IS = function(T, SA, fs, n, k, isbwt) {
var C, B, RA;
var i, j, b, c, m, p, q, name, pidx = 0, newfs;
var c0, c1;
var flags = 0;
// allocate temporary storage [CSA]
// Prefer carving C and B out of the spare tail of SA when it is large
// enough; otherwise allocate them via Util.makeS32Buffer.
if (k <= 256) {
C = Util.makeS32Buffer(k);
if (k <= fs) { B = SA.subarray(n + fs - k); flags = 1; }
else { B = Util.makeS32Buffer(k); flags = 3; }
} else if (k <= fs) {
C = SA.subarray(n + fs - k);
if (k <= (fs - k)) { B = SA.subarray(n + fs - k * 2); flags = 0; }
else if (k <= 1024) { B = Util.makeS32Buffer(k); flags = 2; }
else { B = C; flags = 8; }
} else {
C = B = Util.makeS32Buffer(k);
flags = 4 | 8;
}
/* stage 1: reduce the problem by at least 1/2
sort all the LMS-substrings */
getCounts(T, C, n, k);
getBuckets(C, B, k, true); /* find ends of buckets */
for (i = 0; i < n; i++) { SA[i] = 0; }
// Scan T from right to left. Each position found by the scan reserves a
// slot at the end of the bucket of its first character; m counts them.
// The store into a reserved slot is deferred by one iteration (b is the
// slot reserved last, j the value that will be written into it).
b = -1; i = n - 1; j = n; m = 0; c0 = T[n - 1];
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) >= c1));
for (; i >= 0 ;) {
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) <= c1));
if ( i >= 0 ) {
if ( b >= 0 ) { SA[b] = j; }
b = --B[c1];
j = i;
++m;
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) >= c1));
}
}
if (m > 1) {
LMSsort(T, SA, C, B, n, k);
name = LMSpostproc(T, SA, n, m);
} else if (m === 1) {
SA[b] = j + 1;
name = 1;
} else {
name = 0;
}
/* stage 2: solve the reduced problem
recurse if names are not yet unique */
if(name < m) {
// Release freshly allocated buffers for the duration of the recursion.
if((flags & 4) !== 0) { C = null; B = null; }
if((flags & 2) !== 0) { B = null; }
newfs = (n + fs) - (m * 2);
if((flags & (1 | 4 | 8)) === 0) {
// C lives in the tail of SA here: either keep k entries out of the
// space handed to the recursion, or mark C for recomputation (flag 8).
if((k + name) <= newfs) { newfs -= k; }
else { flags |= 8; }
}
ASSERT( (n >>> 1) <= (newfs + m) );
// Gather the non-zero names (stored by LMSpostproc from index m onward)
// into a contiguous block ending at index m*2 + newfs - 1, converting
// them to 0-based values.
for (i = m + (n >>> 1) - 1, j = m * 2 + newfs - 1; m <= i; i--) {
if(SA[i] !== 0) { SA[j--] = SA[i] - 1; }
}
// The reduced text RA is a view into SA; recurse with alphabet size `name`.
RA = SA.subarray(m + newfs);
SA_IS(RA, SA, newfs, m, name, false);
RA = null;
// Re-scan T exactly as in stage 1, this time writing the found positions
// (i + 1) into SA[m..2m-1], filled from the top down.
i = n - 1; j = m * 2 - 1; c0 = T[n - 1];
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) >= c1));
for (; i >= 0 ;) {
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) <= c1));
if ( i >= 0 ) {
SA[j--] = i + 1;
do { c1 = c0; } while ((--i >= 0) && ((c0 = T[i]) >= c1));
}
}
// Map the reduced problem's result back to positions in T.
for (i = 0; i < m; i++) { SA[i] = SA[m + SA[i]]; }
if((flags & 4) !== 0) { C = B = Util.makeS32Buffer(k); }
if((flags & 2) !== 0) { B = Util.makeS32Buffer(k); }
}
/* stage 3: induce the result for the original problem */
if((flags & 8) !== 0) { getCounts(T, C, n, k); }
/* put all left-most S characters into their buckets */
if (m > 1) {
getBuckets(C, B, k, true); /* find ends of buckets */
// Walk SA[0..m-1] from the top down, moving each entry to the end of the
// bucket of its first character and zeroing every slot in between.
i = m - 1; j = n; p = SA[m - 1]; c1 = T[p];
do {
q = B[c0 = c1];
while (q < j) { SA[--j] = 0; }
do {
SA[--j] = p;
if(--i < 0) { break; }
p = SA[i];
} while((c1 = T[p]) === c0);
} while (i >= 0 );
while ( j > 0 ) { SA[--j] = 0; }
}
if (!isbwt) { induceSA(T, SA, C, B, n, k); }
else { pidx = computeBWT(T, SA, C, B, n, k); }
C = null; B = null;
return pidx;
};
// Container for the public functions of this module; created without a
// prototype, so it has no inherited properties.
var BWT = Object.create(null);
/** Compute the suffix array of T[0..n-1] into SA.
 *
 * SA should be an Int32Array (signed!); T can be any typed array.
 * alphabetSize is optional if T is a Uint8Array or Uint16Array (any array
 * with 1- or 2-byte elements); otherwise it must be given.
 *
 * @param T            the text
 * @param SA           output array with room for at least n entries
 * @param n            number of symbols of T to sort
 * @param alphabetSize number of distinct symbol values (symbols are
 *                     0..alphabetSize-1)
 * @returns 0 (the value SA_IS yields when not computing a BWT)
 * @throws Error if alphabetSize is omitted and cannot be derived from T
 */
BWT.suffixsort = function(T, SA, n, alphabetSize) {
  ASSERT( T && SA && T.length >= n && SA.length >= n );
  if (n <= 1) {
    if (n === 1) { SA[0] = 0; }
    return 0;
  }
  if (!alphabetSize) {
    if (T.BYTES_PER_ELEMENT === 1) { alphabetSize = 256; }
    else if (T.BYTES_PER_ELEMENT === 2) { alphabetSize = 65536; }
    else { throw new Error('Need to specify alphabetSize'); }
  }
  ASSERT( alphabetSize > 0 );
  if (T.BYTES_PER_ELEMENT) {
    // Use Math.pow instead of `1 << bits`: JavaScript shift counts are taken
    // modulo 32, so `1 << 32` is 1 and the check would wrongly fail for
    // arrays with 4-byte (or wider) elements.
    ASSERT( alphabetSize <= Math.pow(2, T.BYTES_PER_ELEMENT * 8) );
  }
  return SA_IS(T, SA, 0, n, alphabetSize, false);
};
/** Burrows-Wheeler Transform.
 *
 * A should be an Int32Array (signed!); T can be any typed array.
 * U is the same type as T (it is used for output).
 * alphabetSize is optional if T is a Uint8Array or Uint16Array (any array
 * with 1- or 2-byte elements); otherwise it must be given.
 * ASSUMES STRING IS TERMINATED WITH AN EOF CHARACTER.
 *
 * @param T            the text
 * @param U            output array with room for at least n entries
 * @param A            work array with room for at least n entries
 * @param n            number of symbols of T to transform
 * @param alphabetSize number of distinct symbol values
 * @returns the primary index plus one (or n itself when n <= 1); pass this
 *          value to BWT.unbwtransform as `pidx`
 * @throws Error if alphabetSize is omitted and cannot be derived from T
 */
BWT.bwtransform = function(T, U, A, n, alphabetSize) {
  var i, pidx;
  ASSERT( T && U && A );
  ASSERT( T.length >= n && U.length >= n && A.length >= n );
  if (n <= 1) {
    if (n === 1) { U[0] = T[0]; }
    return n;
  }
  if (!alphabetSize) {
    if (T.BYTES_PER_ELEMENT === 1) { alphabetSize = 256; }
    else if (T.BYTES_PER_ELEMENT === 2) { alphabetSize = 65536; }
    else { throw new Error('Need to specify alphabetSize'); }
  }
  ASSERT( alphabetSize > 0 );
  if (T.BYTES_PER_ELEMENT) {
    // Use Math.pow instead of `1 << bits`: JavaScript shift counts are taken
    // modulo 32, so `1 << 32` is 1 and the check would wrongly fail for
    // arrays with 4-byte (or wider) elements.
    ASSERT( alphabetSize <= Math.pow(2, T.BYTES_PER_ELEMENT * 8) );
  }
  pidx = SA_IS(T, A, 0, n, alphabetSize, true);
  // The output starts with the last symbol of T; the symbols left in A are
  // copied after it, skipping A[pidx].
  U[0] = T[n - 1];
  for (i = 0; i < pidx ; i++) { U[i + 1] = A[i]; }
  for (i += 1; i < n; i++) { U[i] = A[i]; }
  return pidx + 1;
};
/** Reverses BWT.bwtransform. (ASSUMES STRING WAS TERMINATED WITH EOF.)
 *
 * @param T            the transformed text
 * @param U            output array with room for at least n entries
 * @param LF           work array with room for at least n entries
 * @param n            number of symbols
 * @param pidx         the index returned by BWT.bwtransform
 * @param alphabetSize optional number of distinct symbol values; defaults to
 *                     65536 when T has 2-byte elements and to 256 otherwise.
 *                     Must be supplied for larger alphabets, since every
 *                     symbol of T is used as an index into the count table.
 */
BWT.unbwtransform = function(T, U, LF, n, pidx, alphabetSize) {
  var i, t, C;
  if (!alphabetSize) {
    alphabetSize = (T.BYTES_PER_ELEMENT === 2) ? 65536 : 256;
  }
  C = Util.makeU32Buffer(alphabetSize);
  for (i = 0; i < alphabetSize; i++) { C[i] = 0; }
  // LF[i]: how many times the symbol T[i] occurred before position i.
  for (i = 0; i < n; i++) { LF[i] = C[T[i]]++; }
  // Turn the counts into the number of symbols smaller than each symbol.
  for (i = 0, t = 0; i < alphabetSize; i++) { t += C[i]; C[i] = t - C[i]; }
  // Follow the mapping from position 0, emitting the text back to front.
  for (i = n - 1, t = 0; i >= 0; i--) {
    t = LF[t] + C[U[i] = T[t]];
    t += (t < pidx) ? 1 : 0;
  }
};
/** Burrows-Wheeler Transform.
 *
 * T can be any typed array.
 * U is the same type as T (it is used for output).
 * alphabetSize is optional if T is a Uint8Array or Uint16Array (any array
 * with 1- or 2-byte elements); otherwise it must be given.
 * ASSUMES STRING IS CYCLIC.
 * (XXX: this is twice as inefficient as I'd like! [CSA])
 *
 * The text is doubled before sorting. NOTE: if T already has room for 2*n
 * entries, the doubling is done in place and T[n..2n-1] is overwritten.
 *
 * @param T            the text
 * @param U            output array with room for at least n entries
 * @param n            number of symbols of T to transform
 * @param alphabetSize number of distinct symbol values
 * @returns the index in U that corresponds to the suffix starting at
 *          position 0 of T (0 when n <= 1)
 * @throws Error if alphabetSize is omitted and cannot be derived from T
 */
BWT.bwtransform2 = function(T, U, n, alphabetSize) {
  var i, j, s, pidx = 0;
  var TT, A;
  ASSERT( T && U );
  ASSERT( T.length >= n && U.length >= n );
  if (n <= 1) {
    if (n === 1) { U[0] = T[0]; }
    return 0;
  }
  if (!alphabetSize) {
    if (T.BYTES_PER_ELEMENT === 1) { alphabetSize = 256; }
    else if (T.BYTES_PER_ELEMENT === 2) { alphabetSize = 65536; }
    else { throw new Error('Need to specify alphabetSize'); }
  }
  ASSERT( alphabetSize > 0 );
  if (T.BYTES_PER_ELEMENT) {
    // Use Math.pow instead of `1 << bits`: JavaScript shift counts are taken
    // modulo 32, so `1 << 32` is 1 and the check would wrongly fail for
    // arrays with 4-byte (or wider) elements.
    ASSERT( alphabetSize <= Math.pow(2, T.BYTES_PER_ELEMENT * 8) );
  }
  // double length of T
  if (T.length >= n*2) {
    TT = T; // do it in place if possible
  } else if (alphabetSize <= 256) {
    TT = Util.makeU8Buffer(n*2);
  } else if (alphabetSize <= 65536) {
    TT = Util.makeU16Buffer(n*2);
  } else {
    TT = Util.makeU32Buffer(n*2);
  }
  if (TT !== T) {
    for (i = 0; i < n; i++) { TT[i] = T[i]; }
  }
  for (i = 0; i < n; i++) { TT[n+i] = TT[i]; }
  // sort doubled string
  A = Util.makeS32Buffer(n*2);
  SA_IS(TT, A, 0, n*2, alphabetSize, false);
  // Keep only the suffixes that start in the first copy; for each, emit the
  // symbol that cyclically precedes it.
  for (i = 0, j = 0; i < 2*n; i++) {
    s = A[i];
    if (s < n) {
      if (s === 0) { pidx = j; }
      if (--s < 0) { s = n - 1; }
      U[j++] = T[s];
    }
  }
  ASSERT(j === n);
  return pidx;
};
return freeze(BWT);
});