Skip to content

Commit 991198f

Browse files
committed
Improve the UUID4 detection.
We already had code for spotting this in search_trie, so this commit improves that a little and uses it in encode_name instead of performing a second scan. It also improves the compression of mixed data sets. This still isn't optimal, as we'd need to start separating the name classes and adding NOP tokens, but it's often a 10-20% compression improvement.
1 parent c666474 commit 991198f

1 file changed

Lines changed: 15 additions & 26 deletions

File tree

htscodecs/tokenise_name3.c

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -610,12 +610,16 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
610610
prefix_len = 6; // IonTorrent
611611
*fixed_len = 6;
612612
*is_fixed = 1;
613-
} else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
614-
((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
615-
((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
613+
} else if (l >= 36
614+
&& d[f+8]=='-' && d[f+13]=='-' && d[f+18]=='-' && d[f+23]=='-'
615+
&& isxdigit((uint8_t)d[f+0]) && isxdigit((uint8_t)d[f+7])
616+
&& isxdigit((uint8_t)d[f+9]) && isxdigit((uint8_t)d[f+12])
617+
&& isxdigit((uint8_t)d[f+14]) && isxdigit((uint8_t)d[f+17])
618+
&& isxdigit((uint8_t)d[f+19]) && isxdigit((uint8_t)d[f+22])
619+
&& isxdigit((uint8_t)d[f+24]) && isxdigit((uint8_t)d[f+35])) {
616620
// ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
617-
prefix_len = 37;
618-
*fixed_len = 37;
621+
prefix_len = 36;
622+
*fixed_len = 36;
619623
*is_fixed = 1;
620624
} else {
621625
// Check Illumina and trim back to lane:tile:x:y.
@@ -638,7 +642,6 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
638642
*is_fixed = 0;
639643
}
640644
}
641-
//prefix_len = INT_MAX;
642645

643646
if (!ctx->t_head) {
644647
ctx->t_head = calloc(1, sizeof(*ctx->t_head));
@@ -647,6 +650,7 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
647650
}
648651

649652
// Find an item in the trie
653+
int from_punct = from;
650654
for (nlines = i = 0; i < len; i++, nlines++) {
651655
t = ctx->t_head;
652656
while (i < len && data[i] > '\n') {
@@ -661,24 +665,18 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
661665
x = x->sibling;
662666
t = x;
663667

664-
// t = t->next[c];
665-
666-
// if (!t)
667-
// return -1;
668-
669668
from = t->n;
669+
if ((ispunct(c) || isspace(c)) && t->n != n)
670+
from_punct = t->n;
670671
if (i == prefix_len) p3 = t->n;
671-
//if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
672-
//if (i == 60) p3 = t->n; // pacbio
673-
//if (i == 7) p3 = t->n; // iontorrent
674672
t->n = n;
675673
}
676674
}
677675

678676
//printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
679677

680678
*exact = (n != from) && len;
681-
return *exact ? from : p3;
679+
return *exact ? from : (p3 != -1 ? p3 : from_punct);
682680
}
683681

684682

@@ -731,17 +729,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
731729
encode_token_diff(ctx, cnum-pnum);
732730
int ntok = 1;
733731

734-
// Look for common form of UUID4 names and special case them
735-
i = 0;
736-
if (len == 36) {
737-
for (i = 0; i < len; i++) {
738-
if (!(isxdigit((uint8_t)name[i]) || name[i] == '-'))
739-
break;
740-
}
741-
}
742-
743-
// Is uuid4 (eg ONT).
744-
if (i == len) {
732+
// ONT uuid4: identified in search_trie
733+
if (fixed_len == 36) {
745734
if (37 >= ctx->max_tok) {
746735
do {
747736
memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));

0 commit comments

Comments
 (0)