//  crm_fast_substring_compression.c  - CRM114 - version v1.0
//  Copyright 2001-2008  William S. Yerazunis, all rights reserved.
//
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.
//
//  Other licenses may be negotiated; contact the
//  author for details.
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"




#if !CRM_WITHOUT_FSCM



/////////////////////////////////////////////////////////////////
//
//     Compression-match classification
//
//     This classifier is based on the use of the Lempel-Ziv LZ77
//     (published in 1977) algorithm for fast compression; more
//     compression implies better match.
//
//     The basic idea of LZ77 is to encode strings of N characters
//     as a small doublet (Nstart, Nlen), where Nstart and Nlen are
//     backward references into previously seen text.  If there's
//     no previous correct back-reference string, then don't compress
//     those characters.
//
//     Thus, LZ77 is a form of _universal_ compressor; it starts out knowing
//     nothing of what it's to compress, and develops compression
//     tables "on the fly".
//
//     It is well known that one way of doing text matching is to
//     compare relative compressibility - that is, given known
//     texts K1 and K2, the unknown text U is in the same class as K1
//     iff the LZ77 compression of K1|U is smaller (fewer bytes) than
//     the LZ77 compression of K2|U .  (remember you need to subtract
//     the compressed lengths of K1 and K2 respectively).
//
//     There are several ways to generate LZ compression fast; one
//     way is by forward pointers on N-letter prefixes.  Another
//     way is to decide on a maximum string depth and build transfer
//     tables.
//
//     One problem with LZ77 is that finding the best possible compression
//     is NP-hard.  Consider this example:
//
//         ABCDEFGHI DEFGHIJLMN BCDEFGHIJ JKLMNOPQ ABCDEFGHIJKLMNOP
//
//     Is it better to code the final part with the A-J segment
//     followed by the J-P segment, or with a literal ABC, then D-N,
//     then the literal string OP?  Without doing the actual math, you
//     can't easily decide which is the better compression.  In the
//     worst case, the problem becomes the "knapsack" problem and
//     is thus NP-hard.
//
//     To avoid this, we take the following heuristic for our first
//     coding attempt:
//
//          Look for the longest match of characters that
//          match the unknown at this point and use that.
//
//     In the worst case, this algorithm will traverse the entire
//     known text for each possible starting character, and possibly
//     do a local search if that starting character matches the
//     character in the unknown text.  Thus, the time to run is
//     bounded by |U| * |K|.
//
//     Depending on the degree of overlap between strings, this
//     heuristic will be at best no worse than half as good as the
//     best possible heuristic, and (handwave not-proven) not worse
//     than one quarter as good as the best possible heuristic.
//
//     As a speedup, we can use a prefix lookforward based on N
//     (the number of characters we reqire to match before switching
//     from literal characters to start/length ranges).   Each character
//     also carries an index, saying "for the N-character lookforward
//     prefix I am at the start of, you can find another example of this
//     at the following index."
//
//     For example, the string "ABC DEF ABC FGH ABC XYZ" would have
//     these entries inserted sequentially into the lookforward table:
//
//     ABC --> 0
//     BC  --> 1
//     C D --> 2
//      DE --> 3
//     DEF --> 4
//     EF  --> 5
//     F A --> 6
//      AB --> 7
//
//    At this point, note that "ABC" recurs again.  Since we want to
//    retain both references to "ABC" strings, we place the index of
//    the second ABC (== 8) into the "next occurrence" tag of the
//    first "ABC".  (or, more efficiently, set the second ABC to point
//    to the first ABC, and then have the lookforward table point to the
//    second ABC (thus, the chain of ABCs is actually in the reverse order
//    of their encounters).
//
//    For prefix lengths of 1, 2, and 3, the easiest method is to
//    direct-map the prefixes into a table.  The table lengths would
//    be 256, 65536, and 16 megawords (64 megabytes).  The first two are
//    eminently tractable; the third marginally so.  The situation
//    can be improved by looking only at the low order six bits of
//    the characters as addresses into the direct map table.  For normal
//    ASCII, this means that the control characters are mapped over the
//    capital letters, and the digits and punctuation are mapped over
//    lowercase letters, and uses up only 16 megabytes for the table entries.
//
//    Of course, a direct-mapped, 1:1 table is not the only possible
//    table.  It is also possible to create a hash table with overflow
//    chains.  For example, an initial two-character table (256Kbytes) yields
//    the start of a linear-search chain; this chain points to a linked list of
//    all of the third characters yet encountered.
//
//    Here's some empirical data to get an idea of the size of the
//    table actually required:
//
//    for SA's hard_ham
//       lines         words       three-byte sequences
//         114K          464K        121K
//          50K          210K         61K
//          25K          100K         47K
//          10K           47K         31K
//           5K           24K         21K
//
//    For SA's easy_ham_2:
//       lines         words       three-byte sequences
//         134K          675K          97K
//          25K          130K          28K
//
//    For SA's spam_2:
//       lines         words       three-byte sequences
//         197K          832K          211K
//         100K          435K          116K
//
//    So, it looks like in the long term (and in English) there is
//    an expectation of roughly as many 3-byte sequences as there are
//    lines of text, probably going asymptotic at around
//    a quarter of a million unique 3-byte sequences.  Note that
//    the real maximum is 2^24 or about 16 million three-byte
//    sequences; however some of them would never occur except in
//    binary encodings.
//
//
//     ----- Embedded Limited-Length Markov Table Option -----
//
//    Another method of accomplishing this (suitable for N of 3 and larger)
//    is to use transfer tables allocated only on an as-needed basis,
//    and to store the text in the tables directly (thus forming a sort of
//    Markov chain of successive characters in memory).
//
//    The root table contains pointers to the inital occurrence in the known
//    text of all 256 possible first bytes; each subsequent byte then
//    allocates another transfer table up to some predetermined limit on
//    length.
//
//    A disadvantage of this method is that it uses up more memory for
//    storage than the index chaining method; further it must (by necessity)
//    "cut off" at some depth thereby limiting the longest string that
//    we want to allocate another table for.  In the worst case, this
//    system generates a doubly diagonal tree with |K|^2 / 4 tables.
//    On the other hand, if there is a cutoff length L beyond which we
//    don't expand the tables, then only the tables that are needed get
//    allocated.  As a nice side effect, the example text becomes less
//    trivial to extract (although it's there, you have to write a
//    program to extract it rather than just using "strings", unlike the
//    bit entropy classifier where a well-populated array contains a lot
//    of uncertainty and it's very difficult to even get a single
//    byte unambiguously.
//
//    Some empirical data on this method:
//
//     N = 10
//       Text length (bytes)       Tables allocated
//           1211                       7198
//          69K                       232K
//          96K                       411K
//         204K                       791K
//
//     N=5
//       Text length (bytes)       Tables allocated
//            1210                      2368
//           42K                       47K
//           87K                       79K
//          183K                      114K
//          386K                      177K
//          841K                      245K
//         1800K                      342K
//         3566K                      488K
//         6070K                      954K
//         8806K                     1220K
//
//      N=4
//       Text length (bytes)       Tables allocated
//           87K                       40K
//          183K                       59K
//          338K                       89K
//          840K                      121K
//         1800K                      167K
//         3568K                      233K
//         6070K                      438K
//
//      N=3
//       Text length (bytes)       Tables allocated
//           87K                       14K
//          183K                       22K
//          386K                       31K
//          840K                       42K
//         1800K                       58K
//         3568K                       78K
//         6070K                      132K
//
//
//    Let's play with the numbers a little bit.  Note that
//    the 95 printable ASCII characters could have a theoretical
//    maximum of 857K sequences, and the full 128-character ASCII
//    (low bit off) is 2.09 mega-sequences.  If we casefold A-Z onto
//    a-z and all control characters to "space", then the resulting
//    69 characters is only 328K possible unique sequences.
//
//    A simpler method is to fold 0x7F-0x8F down onto 0x00-0x7F, and
//    then 0x00-0x3F onto 0x40-0x7F (yielding nothing but printable
//    characters- however, this does not preserve whitespace in any sense).
//    When folded this way, the SA hard_ham corpus (6 mbytes, 454 words, 114
//    K lines) yields 89,715 unique triplets.
//
//    Of course, for other languages, the statistical asymptote is
//    probably present, but casefolding in a non-Roman font is probably
//    going to give us weak results.
//
//    --- Other Considerations ---
//
//    Because it's unpleasant and CPU-consuming to read and write
//    non-binary-format statistics files (memory mapping is far
//    faster) it's slightly preferable to have statistics files
//    that don't have internal structures that change sizes (appending is
//    more pleasant than rewriting with new offsets).  Secondarily,
//    a lot of users are much more comfortable with the knowledge
//    that their statistics files won't grow without bound.  Therefore,
//    fixed-size structures are often preferred.
//
//
//   --- The text storage ---
//
//   In all but the direct-mapped table method, the text itself needs
//   to be stored because the indexing method only says where to look
//   for the first copy of any particular string head, not where all
//   of the copies are.  Thus, each byte of the text needs an index
//   (usually four bytes) of "next match" information.  This index
//   points to the start of the next string that starts with the
//   current N letters.
//
//   Note that it's not necessary for the string to be unique; the
//   next match chains can contain more than one prefix.  As long
//   as the final matching code knows that the first N bytes need
//   to be checked, there's no requirement that chains cannot be
//   shared among multiple N-byte prefixes.  Indeed, in the limit,
//   a simple sequential search can be emulated by a shared-chain
//   system with just ONE chain (each byte's "try next" pointer
//   points to the next byte in line).  These nonunique "try next"
//   chains may be a good way to keep the initial hash table
//   manageabley small.  However, how to efficiently do this
//   "in line" is unclear (the goal of in-line is to hide the
//   known text so that "strings" can't trivially extract it;
//   the obvious solution is to have two data structures (one by
//   bytes, one by uint32's, but the byte structure is then easily
//   perusable).
//
//   Another method would be to have the initial table point not
//   to text directly, but to a lookforward chain.  Within the chain,
//   cells are allocated only when the offset backward exceeds the
//   offset range allowed by the in-line offset size.  For one-byte
//   text and three-byte offsets, this can only happen if the text
//   grows beyond 16 megabytes of characters (64 megabyte footprint)
//
//    --- Hash Tables Revisited ---
//
//   Another method is to have multiple hash entries for every string
//   starting point.  For example, we might hash "ABC DEF ABC", "ABC DEF",
//   and "ABC" and put each of these into the hash table.
//
//   We might even consider that we can _discard_ the original texts
//   if our hash space is large enough that accidental clashes are
//   sufficiently rare.  For example, with a 64-bit hash, the risk of
//   any two particular strings colliding is 1E-19, and the risk of
//   any collision whatsoever does not approach 50% with less than
//   1 E 9 strings in the storage.
//
//     ------- Hashes and Hash Collisions -----
//
//   To see how the hashes would collide with the CRM114 function strnhash,
//   we ran the Spamassasin hard-ham corpus into three-byte groups, and
//   then hashed the groups.  Before hashing, there were 125,434 unique
//   three-byte sequences; after hashing, there were 124,616 unique hashes;
//   this is 818 hash collisions (a rate of 0.65%).  This is a bit higher
//   than predicted for truly randomly chosen inputs, but then again, the
//   SA corpus has very few bytes with the high order bit set.
//
//    ------ Opportunistic chain sharing -----
//
//   (Note- this is NOT being built just yet; it's just an idea) - the
//   big problem with 24-bit chain offsets is that it might not have
//   enough "reach" for the less common trigrams; in the ideal case any
//   matching substring is good enough and losing substrings is anathema.
//   However, if we have a number of sparse chains that are at risk for
//   not being reachable, we can merge those chains either together or
//   onto another chain that's in no dagner of running out of offsets.
//
//   Note that it's not absolutely necessary for the two chains to be
//   sorted together; as long as the chains are connected end-to-end,
//   the result will still be effective.
//
//    -----  Another way to deal with broken chains -----
//
//    (also NOT being built yet; this is just another possibility)
//   Another option: for systems where there are chains that are about
//   to break because the text is exceeding 16 megabytes (the reach of
//   a 24-bit offset), at the end of each sample document we can insert
//   a "dummy" forwarding cell that merely serves to preserve continuity
//   of any chain that might be otherwise broken because the N-letter prefix
//   string has not occured even once in the preceding 16 megacells.
//   (worst case analysis: there are 16 million three-byte prefixes, so
//   if all but ONE prefix was actually ever seen in a 16-meg block, we'd
//   have a minor edge-case problem for systems that did not do chain
//   folding.  With chain-folding down to 18 bits (256K different chains)
//   we'd have no problem at all, even in the worst corner case.)
//
//   However, we still need to insert these chain-fixers preemptively
//   if we want to use "small" lookforward cells, because small (24-bit)
//   cells don't have the reach to be able to index to the first occurrence
//   of a chain that's never been seen in the first 16 megacharacters.
//   This means that at roughly every 16-megacell boundary we would
//   insert a forwarding dummy block (worst case size of 256K cells, on
//   the average a lot fewer because some will actually get used in real
//   chains.) That sounds like a reasonable tradeoff in size, but the
//   bookkeeping to keep it all straight is goint to be painful to code and
//   test rigorously.
//
//
//     ------- Hashes-only storage ------
//
//   In this method, we don't bother to store the actual text _at all_,
//   but we do store chains of places where it occurred in the original
//   text.  In this case, we LEARN by sliding our window of strnhash(N)
//   characters over the text.  Each position generates a four-byte
//   backward index (which may be NULL) to the most recent previous
//   encounter of this prefix; this chain grows basically without limit.
//
//   To CLASSIFY, we again slide our strnhash(N) window over the text;
//   and for each offset position we gather the (possibly empty) list of
//   places where that hash occurred.  Because the indices are pre-sorted
//   (always in descending order) it is O(n) in the length of the chains
//   to find out any commonality because the chains can be traversed by the
//   "two finger method" (same as in the hyperspace classifier).  The
//   result for any specific starting point is the list of N longest matches
//   for the leading character position as seen so far.  If we choose to
//   "commit on N characters matching, then longest that starts in that
//   chain" then the set of possible matches is the tree of indices and
//   we want the longest branch.
//
//   This is perhaps most easily done by an N-finger method where we keep
//   a list of "fingers" to the jth, j+1, j+2... window positions; at each
//   position j+k we merely insure that there is an unbroken path from j+0
//   to j+k.  (we could speed this up significantly by creating a lookaside
//   list or array that contains ONLY the valid positions at j+k; moving the
//   window to k+1 means only having to check through that array to find at
//   least one member equal to the k+1 th window chain.  In this case, the
//   "two-finger" method suffices, and the calculation can be done "in place".
//   When the path breaks (that is, no feasible matches remain), we take
//   N + k - 1 to be the length of the matched substring and begin again
//   at j = N + k + j.
//
//   Another advantage of this method is that there is no stored text to
//   recover directly; a brute-force attack, one byte at a time, will
//   recover texts but not with absolute certainty as hash collisions
//   might lead to unexpected forks in the chain.
//
//
//     ------- Design Decision -----
//
//   Unless something better comes up,  if we just take the strnhash() of the
//   N characters in the prefix, we will likely get a fairly reasonable
//   distribution of hash values which we can then modulo down to whatever
//   size table we're actually using.  Thus, the size of the prefix and the
//   size of the hah table are both freely variable in this design.
//
//   We will use the "hash chains only" method to store the statistics
//   information (thus providing at least moderate obscuration of the
//   text, as well as moderately efficient storage.
//
//   As a research extension, we will allow an arbitrary regex to determine
//   the meaning of the character window; repeated regexing with k+1 starting
//   positions yield what we will define as "legitimately adjacent window
//   positions".  We specifically define that we do not care if these are
//   genuinely adjacent positions; we can define these things as we wish (and
//   thereby become space-invariant, if we so choose.
//
//   A question: should the regex define the next "character", or should
//   it define the entire next window?  The former allows more flexibility
//   (and true invariance over charactersets); the latter is easier to
//   implement and faster at runtime.  Decision: implment as "defines the
//   whole window".  Then we use the count of subexpressions to define our
//   window length; this would allow skipping arbitrary text - with all the
//   programming power and danger of abuse that entails. Under this paradigm,
//   the character regex is /(.)(.)(.)/ for an N=3 minimum chain.
//
//   A quick test shows that strnhash on [a-zA-Z0-9 .,-] shows no
//   duplications, nor any hash clashes when taken mod 256.  Thus,
//   using a Godel coding scheme (that is, where different offsets are
//   each multiplied by a unique prime number and then those products
//   are added together ) will *probably* give good hash results.
//   Because we're moduloing (taking only the low order bits) the
//   prime number "2" is a bit problematic and we may choose to skip it.
//   Note that a superincreasing property is not useful here.
//
//   Note that the entire SA corpus is only about 16 megabytes of
//   text, so a full index set of the SA corpus would be on the
//   order of 68 megabytes ( 4 megs of index, then another 64 megs
//   of index chains)
//
//   Note also that there is really no constraint that the chains start
//   at the low indices and move higher.  It is equally valid for the chains
//   to start at the most recent indices and point lower in memory; this
//   actually has some advantage in speed of indexing; each chain element
//   points to the _previous_ element and we do the two-finger merge
//   toward lower indices.
//
//   Note also that there is no place the actual text or even the actual
//   hashes of the text are stored.  All hashes that map to the same place
//   in the "seen at" table are deemed identical (and no record is kept);
//   similarly each cell of "saved text" is really only a pointer to the
//   most recent previous location where something that mapped to the
//   same hash table bucket was seen).  Reconstruction of the prior text is
//   hence marginal in confidence.
//
///////////////////////////////////////////////////////////


#ifdef NEED_PRIME_NUMBERS
///////////////////////////////////////////////////////////////
//
//     Some prime numbers to use as weights.
//
//     GROT GROT GROT  note that we have a 1 here instead of a 2 as the
//     first prime number!  That's strictly an artifice to use all of the
//     hash bits and is not an indication that we don't know that 2 is prime.

static unsigned int primes[260] = {
    1,      3,      5,      7,     11,     13,     17,     19,     23,     29,
    31,     37,     41,     43,     47,     53,     59,     61,     67,     71,
    73,     79,     83,     89,     97,     101,    103,    107,    109,    113,
    127,    131,    137,    139,    149,    151,    157,    163,    167,    173,
    179,    181,    191,    193,    197,    199,    211,    223,    227,    229,
    233,    239,    241,    251,    257,    263,    269,    271,    277,    281,
    283,    293,    307,    311,    313,    317,    331,    337,    347,    349,
    353,    359,    367,    373,    379,    383,    389,    397,    401,    409,
    419,    421,    431,    433,    439,    443,    449,    457,    461,    463,
    467,    479,    487,    491,    499,    503,    509,    521,    523,    541,
    547,    557,    563,    569,    571,    577,    587,    593,    599,    601,
    607,    613,    617,    619,    631,    641,    643,    647,    653,    659,
    661,    673,    677,    683,    691,    701,    709,    719,    727,    733,
    739,    743,    751,    757,    761,    769,    773,    787,    797,    809,
    811,    821,    823,    827,    829,    839,    853,    857,    859,    863,
    877,    881,    883,    887,    907,    911,    919,    929,    937,    941,
    947,    953,    967,    971,    977,    983,    991,    997,   1009,   1013,
    1019,   1021,   1031,   1033,   1039,   1049,   1051,   1061,   1063,   1069,
    1087,   1091,   1093,   1097,   1103,   1109,   1117,   1123,   1129,   1151,
    1153,   1163,   1171,   1181,   1187,   1193,   1201,   1213,   1217,   1223,
    1229,   1231,   1237,   1249,   1259,   1277,   1279,   1283,   1289,   1291,
    1297,   1301,   1303,   1307,   1319,   1321,   1327,   1361,   1367,   1373,
    1381,   1399,   1409,   1423,   1427,   1429,   1433,   1439,   1447,   1451,
    1453,   1459,   1471,   1481,   1483,   1487,   1489,   1493,   1499,   1511,
    1523,   1531,   1543,   1549,   1553,   1559,   1567,   1571,   1579,   1583,
    1597,   1601,   1607,   1609,   1613,   1619,   1621,   1627,   1637,   1657
};
#endif


////////////////////////////////////////////////////////////////
//
//     Headers and self-identifying files are a good idea; we'll
//     have it happen here.
//

#define FILE_IDENT_STRING "CRM114 Classdata Fast Substring Compression "



//    The prefix array maps a hash to the most recent occurrence of
//    that hash in the text.

typedef struct
{
    uint32_t index;
} FSCM_PREFIX_TABLE_CELL;

typedef struct
{
    uint32_t next;
} FSCM_HASH_CHAIN_CELL;

#ifdef USING_HASH_COMPARE
static int hash_compare(const unsigned int *a, const unsigned int *b)
{
    FSCM_HASH_CHAIN_CELL *pa, *pb;

    pa = (FSCM_HASH_CHAIN_CELL *)a;
    pb = (FSCM_HASH_CHAIN_CELL *)b;
    if (pa->next < pb->next)
        return -1;

    if (pa->next > pb->next)
        return 1;

    return 0;
}
#endif



////////////////////////////////////////////////////////////////////////
//
//    How to learn in FSCM - two parts:
//      1) append our structures to the statistics file in
//         the FSCM_CHAR_STRUCT format;
//      2) index the new structures; if no prior exists on the chain,
//         let previous link be 0, otherwise it's the prior value in the
//         hash table.
//

int crm_fast_substring_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    //     learn the compression version of this input window as
    //     belonging to a particular type.  Note that the word regex
    //     is ignored in this classifier.
    //
    //     learn <flags> (classname)
    //
    int i, j;

    char htext[MAX_PATTERN];         //  the hash name buffer
    char hashfilename[MAX_PATTERN];  // the hashfile name
    FILE *hashf;                     // stream of the hashfile
    unsigned int textoffset, textmaxoffset;
    int hlen;
    int cflags, eflags;

    struct stat statbuf;    //  for statting the statistics file

    char *file_pointer;
    STATISTICS_FILE_HEADER_STRUCT *file_header;
    FSCM_PREFIX_TABLE_CELL *prefix_table;    //  the prefix indexing table,
    unsigned int prefix_table_size = 200000;
    FSCM_HASH_CHAIN_CELL *chains, *newchains;    //  the chain area
    unsigned int newchainstart;                  //  offset in cells of new chains

    int sense;
    int microgroom;
    int unique;
    int use_unigram_features;
    int fev;

    //        unk_hashes is tempbuf, but casting-aliased to FSCM chains
    int unk_hashcount;
    crmhash_t *unk_hashes;

    unk_hashes = (crmhash_t *)tempbuf;



    statbuf.st_size = 0;
    fev = 0;

    if (user_trace)
        fprintf(stderr, "executing an FSCM LEARN\n");


    //           extract the hash file name
    crm_get_pgm_arg(htext, MAX_PATTERN, apb->p1start, apb->p1len);
    hlen = apb->p1len;
    hlen = crm_nexpandvar(htext, hlen, MAX_PATTERN);

    ///////////////////////////////////////////////////////
    //
    //            set our cflags, if needed.  The defaults are
    //            "case" and "affirm", (both zero valued).
    //            and "microgroom" disabled.   Many of these make no
    //            sense (or are unimplemented yet) for FSCM.
    //
    cflags = REG_EXTENDED;
    eflags = 0;
    sense = +1;
    if (apb->sflags & CRM_NOCASE)
    {
        cflags = cflags | REG_ICASE;
        eflags = 1;
        if (user_trace)
            fprintf(stderr, "turning oncase-insensitive match\n");
    }
    if (apb->sflags & CRM_REFUTE)
    {
        sense = -sense;
        /////////////////////////////////////
        //    Take this out when we finally support refutation
        ////////////////////////////////////
        fprintf(stderr, "FSCM Refute is NOT SUPPORTED YET\n");
        return 0;

#if 0 // unreachable code
        if (user_trace)
            fprintf(stderr, " refuting learning\n");
#endif
    }
    microgroom = 0;
    if (apb->sflags & CRM_MICROGROOM)
    {
        microgroom = 1;
        if (user_trace)
            fprintf(stderr, " enabling microgrooming.\n");
    }
    unique = 0;
    if (apb->sflags & CRM_UNIQUE)
    {
        unique = 1;
        if (user_trace)
            fprintf(stderr, " enabling uniqueifying features.\n");
    }

    use_unigram_features = 0;
    if (apb->sflags & CRM_UNIGRAM)
    {
        use_unigram_features = 1;
        if (user_trace)
            fprintf(stderr, " using only unigram features.\n");
    }

    //
    //             grab the filename, and stat the file
    //      note that neither "stat", "fopen", nor "open" are
    //      fully 8-bit or wchar clean...
    // i = 0;
    // while (htext[i] < 0x021) i++;
    // j = i;
    // while (htext[j] >= 0x021) j++;
    if (!crm_nextword(htext, hlen, 0, &i, &j) || j == 0)
    {
        fev = nonfatalerror_ex(SRC_LOC(),
                "\nYou didn't specify a valid filename: '%.*s'\n",
                hlen,
                htext);
        return fev;
    }
    //             filename starts at i,  ends at j. null terminate it.
    htext[j] = 0;
    strcpy(hashfilename, &htext[i]);

    if (user_trace)
        fprintf(stderr, "Target file file %s\n", hashfilename);

    //   No need to do any parsing of a box restriction.
    //   We got txtptr, txtstart, and txtlen from the caller.
    //
    textoffset = txtstart;
    textmaxoffset = txtstart + txtlen;

    {
        int nosuchfile;
        //    Check to see if we need to create the file; if we need
        //    it, we create it here.
        //
        nosuchfile = stat(hashfilename, &statbuf);
        if (nosuchfile)
        {
            //  Note that "my_header" is a local buffer.
            STATISTICS_FILE_HEADER_STRUCT my_header;
            strcpy((char *)my_header.file_ident_string,
                    "CRM114 Classdata FSCM V2 (hashed) ");

            if (user_trace)
                fprintf(stderr, "No such statistics file %s; must create it.\n",
                        hashfilename);
            //   Set the size of the hash table.
            if (sparse_spectrum_file_length == 0)
                sparse_spectrum_file_length = FSCM_DEFAULT_HASH_TABLE_SIZE;

            /////////////////////////////////////////////////
            //     START OF STANDARD HEADER SETUP
            //   header info chunk 0 - the header chunking info itself
            my_header.chunks[0].start = 0;
            my_header.chunks[0].length = sizeof(STATISTICS_FILE_HEADER_STRUCT);
            my_header.chunks[0].tag = 0;
            //
            //   header info, chunk 1 - the ident string
            my_header.chunks[1].start = my_header.chunks[0].length;
            my_header.chunks[1].length = STATISTICS_FILE_IDENT_STRING_MAX;
            my_header.chunks[1].tag = 1;
            //
            //   header info, chunk 2 - the "arbitrary data"
            my_header.chunks[2].start = my_header.chunks[1].start
                                        + my_header.chunks[1].length;
            my_header.chunks[2].length = STATISTICS_FILE_ARB_DATA;
            my_header.chunks[2].tag = 2;
            //    END OF STANDARD HEADER SETUP
            //////////////////////////////////////////////////

            //   header info chunk 3 - the prefix hastable, fixed size
            my_header.chunks[3].start = my_header.chunks[2].start
                                        + my_header.chunks[2].length;
            my_header.chunks[3].length = sparse_spectrum_file_length;
            my_header.chunks[3].tag = 3;
            //   ... and the length of that hash table will be, in cells:
            my_header.data[0] =
                sparse_spectrum_file_length / sizeof(FSCM_PREFIX_TABLE_CELL);
            //
            //   header info chunk 4 - the previous-seen pointers, growing.
            my_header.chunks[4].start = my_header.chunks[3].start
                                        + my_header.chunks[3].length;
            //    Although the starting length is really zero, zero is a sentinel
            //    so we start at 1 bucket further in...
            my_header.chunks[4].length = sizeof(FSCM_HASH_CHAIN_CELL);
            my_header.chunks[4].tag = 4;

            //    Write out the initial file header..
            hashf = fopen(hashfilename, "wb+");
            if (!hashf)
            {
                fev = nonfatalerror_ex(SRC_LOC(),
                        "\n Couldn't open your new CSS file %s for writing; errno=%d(%s)\n",
                        hashfilename,
                        errno,
                        errno_descr(errno));
                return fev;
            }
            else
            {
                if (1 != fwrite(&my_header,
                            sizeof(STATISTICS_FILE_HEADER_STRUCT),
                            1,
                            hashf))
                {
                    fev = nonfatalerror_ex(SRC_LOC(),
                            "\n Couldn't write header to file %s; errno=%d(%s)\n",
                            hashfilename, errno, errno_descr(errno));
                    fclose(hashf);
                    return fev;
                }
                fclose(hashf);
            }
        }
    }

    if (sense > 0)
    {
        /////////////////
        //    THIS PATH TO LEARN A TEXT
        //      1) Make room!  Append enough unsigned int zeroes that
        //         we will have space for our hashes.
        //      2) MMAP the file
        //      3) actually write the hashes
        //      4) adjust the initial-look table to point to those hashes;
        //          while modifying those hashes to point to the most recent
        //          previous hashes;
        //      5) MSYNC the output file.  As we already did a file system
        //          write it should not be necessary to do the mtime write.
        //
        /////////////////
        //    Write out the initial previous-seen hash table (all zeroes):

        {
            FSCM_PREFIX_TABLE_CELL my_zero;
            my_zero.index = 0;
            hashf = fopen(hashfilename, "ab+");
            if (!hashf)
            {
                fev = nonfatalerror_ex(SRC_LOC(),
                        "\n Couldn't open your CSS file %s for appending; errno=%d(%s)\n",
                        hashfilename,
                        errno,
                        errno_descr(errno));
                return fev;
            }
            else
            {
                for (i = 0; i < sparse_spectrum_file_length; i++)
                {
                    if (1 != fwrite(&my_zero,
                                sizeof(FSCM_PREFIX_TABLE_CELL),
                                1,
                                hashf))
                    {
                        fev = nonfatalerror_ex(SRC_LOC(),
                                "\n Couldn't write space fill for hashes to file %s; errno=%d(%s)\n",
                                hashfilename, errno, errno_descr(errno));
                        fclose(hashf);
                        return fev;
                    }
                }

                //    ... and write a single 32-bit zero to cover index zero.
                if (1 != fwrite(&(my_zero), sizeof(FSCM_PREFIX_TABLE_CELL), 1, hashf))
                {
                    fev = nonfatalerror_ex(SRC_LOC(),
                            "\n Couldn't write index 0 init data to file %s; errno=%d(%s)\n",
                            hashfilename, errno, errno_descr(errno));
                    fclose(hashf);
                    return fev;
                }

                //     All written; the file now exists with the right setup.
                fclose(hashf);
            }
        }

        //    We need one 32-bit zero for each character in the to-be-learned
        //    text; we'll soon clobber the ones that are in previously
        //    seen chains to chain members (the others can stay as zeroes).
        {
            FSCM_HASH_CHAIN_CELL my_zero;

            //   Now it's time to generate the actual string hashes.
            //   By default (no regex) it's a string kernel, length 3,
            //   but it can be any prefix one desires.
            //
            //   Generate the hashes.
            crm_vector_tokenize_selector
            (apb,                                             // the APB
                    txtptr,                                   // intput string
                    txtlen,                                   // how many bytes
                    txtstart,                                 // starting offset
                    NULL,                                     // custom tokenizer
                    NULL,                                     // custom matrix
                    unk_hashes,                               // where to put the hashed results
                    data_window_size / sizeof(unk_hashes[0]), //  max number of hashes
                    &unk_hashcount);                          // how many hashes we actually got

            if (internal_trace)
            {
                int display_count = CRM_MIN(16, unk_hashcount);
                fprintf(stderr, "Total %d hashes - first %d values:", unk_hashcount, display_count);
                for (i = 0; i < display_count; i++)
                {
                    if (i % 8 == 0)
                    {
                        fprintf(stderr, "\n#%d..%d:", i, CRM_MIN(i + 7, display_count - 1));
                    }
                    fprintf(stderr, " %08lx", (unsigned long int)unk_hashes[i]);
                }
                fprintf(stderr, "\n");
            }

            //  Now a nasty bit.  Because there might be retained hashes of the
            //  file, we need to force an unmap-by-name which will allow a remap
            //  with the new file length later on.
            if (internal_trace)
                fprintf(stderr,
                        "mmapping file %s for known state\n", hashfilename);
#if 0 // not needed for force_unmap to work as expected.
            crm_mmap_file
            (hashfilename, 0, 1, PROT_READ | PROT_WRITE,
                    MAP_SHARED, CRM_MADV_RANDOM, NULL);
#endif
            crm_force_munmap_filename(hashfilename);
            if (internal_trace)
                fprintf(stderr,
                        "UNmmapped file %s for known state\n", hashfilename);
            if (user_trace)
                fprintf(stderr, "Opening FSCM file %s for append.\n",
                        hashfilename);
            hashf = fopen(hashfilename, "ab+");
            if (!hashf)
            {
                fev = nonfatalerror_ex(SRC_LOC(),
                        "\n Couldn't open your CSS file %s for appending; errno=%d(%s)\n",
                        hashfilename,
                        errno,
                        errno_descr(errno));
                return fev;
            }
            else
            {
                if (user_trace)
                    fprintf(stderr, "Writing to hash file %s\n", hashfilename);
                my_zero.next = 0;
                for (i = 0; i < unk_hashcount + 2; i++)
                {
                    if (1 != fwrite(&my_zero,
                                sizeof(FSCM_HASH_CHAIN_CELL),
                                1,
                                hashf))
                    {
                        fev = nonfatalerror_ex(SRC_LOC(),
                                "\n Couldn't write hash sequence to file %s; errno=%d(%s)\n",
                                hashfilename, errno, errno_descr(errno));
                        fclose(hashf);
                        return fev;
                    }
                }
                fclose(hashf);
            }
        }

        //    Now the file has the space; we can now mmap it and set up our
        //    working pointers.

        stat(hashfilename, &statbuf);
        if (internal_trace)
            fprintf(stderr, "mmapping2 file %s\n", hashfilename);
        file_pointer =
            crm_mmap_file(hashfilename,
                    0, statbuf.st_size,
                    PROT_READ | PROT_WRITE,
                    MAP_SHARED, CRM_MADV_RANDOM, NULL);
        if (internal_trace)
            fprintf(stderr, "mmapped2 file %s\n", hashfilename);

        //  set up our pointers for the prefix table and the chains
        file_header = (STATISTICS_FILE_HEADER_STRUCT *)file_pointer;
        prefix_table_size = file_header->data[0];
        prefix_table = (FSCM_PREFIX_TABLE_CELL *)
                       &file_pointer[file_header->chunks[3].start];
        chains = (FSCM_HASH_CHAIN_CELL *)
                 &file_pointer[file_header->chunks[4].start];

        //    Note the little two-step dance to recover the starting location
        //    of this next chain.
        //
        //      Our new start here !
        newchainstart = file_header->chunks[4].length
                        / sizeof(FSCM_HASH_CHAIN_CELL);
        if (internal_trace)
            fprintf(stderr, "New chains start at offset %u\n", newchainstart);

        newchains = (FSCM_HASH_CHAIN_CELL *)
                    &chains[newchainstart];

        //      ... and this is the new updated length.
        file_header->chunks[4].length += (unk_hashcount + 2)
                                         * sizeof(FSCM_HASH_CHAIN_CELL);


        //   For each hash, insert it into the prefix table
        //   at the right place (that is, at hash mod prefix_table_size).
        //   If the table had a zero, it becomes nonzero.  If the table
        //   is nonzero, the corresponding slot in the newly written
        //   part of the chain zone gets the old address and the hashtable
        //   gets the new address.


        if (internal_trace)
        {
            fprintf(stderr,
                    "\n\nPrefix table size: %u, starting at offset %u\n",
                    prefix_table_size, newchainstart);
        }
        for (i = 0; i < unk_hashcount; i++)
        {
            unsigned int pti, cind;
            pti = unk_hashes[i] % prefix_table_size;
            //   Put in our chain entry
            if (internal_trace)
                fprintf(stderr,
                        "offset %d icn: %u hash %08lx tableslot %u"
                        " (prev offset %u)\n",
                        i,
                        i + newchainstart,
                        (unsigned long int)unk_hashes[i],
                        pti,
                        prefix_table[pti].index
                       );

            if (internal_trace)
            {
                cind = prefix_table[pti].index;
                while (cind != 0)
                {
                    fprintf(stderr,
                            " ... now location %u forwards to %u \n",
                            cind, chains[cind].next);
                    cind = chains[cind].next;
                }
            }
            //  Desired State:
            // chains [old] points to chains [older]chains (which is prior state)
            // chains[new] points to chains[old]
            // prefix_table [pti] = chains [new]
            chains[i + newchainstart].next = prefix_table[pti].index;
            prefix_table[pti].index = i + newchainstart;
        }

        //  forcibly let go of the mmap to force an msync
        if (internal_trace)
            fprintf(stderr, "UNmmapping file %s\n", hashfilename);
        crm_force_munmap_filename(hashfilename);
    }
    return 0;
}

//     A helper (but crucial) function - given an array of hashes and,
//     and a prefix_table / chain array pair, calculate the compressibility
//     of the hashes; this is munged (reports say best accuracy if
//     raised to the 1.5 power) and that is the returned value.
//
//
//   The algorithm here is actually suboptimal.
//   We take the first match presented (i.e. where unk_hashes[i] maps
//   to a nonzero cell in the prefix table) then for each additional
//   hash (i.e. unk_hashes[i+j] where there is a consecutive chain
//   in the chains[q, q+1, q+2]; we sum the J raised to the q_exponent
//   power for each such chain and report that result back.
//
//   The trick we employ here is that for each starting position q
//   all possible solutions are on q's chain, but also on q+1's
//   chain, on q+2's chain, on q+3's chain, and so on.
//
//   At this point, we can go two ways: we can use a single index (q)
//   chain and search forward through the entire chain, or we can use
//   multiple indices and an n-way merge of n chains to cut the number of
//   comparisons down significantly.
//
//   Which is optimal here?  Let's assume the texts obey something like
//   Zipf's law (Nth term is 1/Nth as likely as the 1st term).  Then the
//   probabile number of comparisons to find a string of length Q in
//   a text of length |T| by using the first method is
//          (1/N) + ( 1/ N) + ... = Q * (1/N) and we
//   can stop as soon as we find a string of Q elements (but since we
//   want the longest one, we check all |T| / N occurrences and that takes
//   |T| * Q / N^2 comparisons, and we need roughly |U| comparisons
//   overall, it's |T| * |U| * Q / N^2 .
//
//   In the second method (find all chains of length Q or longer) we
//   touch each of the Q chain members once. The number of members of
//   each chain is roughly |T| / N and we need Q such chains, so the
//   time is |T| * Q / N.  However, at the next position
//   we can simply drop the first chain's constraint; all of the other
//   calculations have already been done once; essentially this search
//   can be carried out *in parallel*; this cuts the work by a factor of
//   the length of the unkonown string.   However, dropping the constraint
//   is very tricky programming and so we aren't doing that right now.
//
//   We might form the sets where chain 1 and chain 2 are sequential in the
//   memory.  We then find where chains 2 and 3 are sequential in the
//   memory; where chains 3 and 4 are sequential, etc.  This is essentially
//   a relational database "join" operation, but with "in same record"
//   being replaced with "in sequential slots".
//
//   Assume we have a vector "V" of indices carrying each chain's current
//   putative pointer for a sequential match. (assume the V vector is
//   as long as the input string).
//
// 0) We initialize the indexing vector "V" to the first element of each
//    chan (or NULL for never-seen chains), and "start", "end", and
//    "total" to zero.
// 1) We set the chain index-index "C" to 0 (the leftmost member
//    of the index vector V).
// 2.0) We do a two-finger merge between the C'th and C + 1 chains,
//    moving one chain link further on the lesser index in each cycle.
//    (NB the current build method causes link indicess to be descending in
//    numerical value; so we step to the next link on the _greater_ of the
//    two chains.
//    2a) we stop the two-finger merge when:
//         V[C] == V[C+1] + 1
//         in which case
//              >> C++,
//              >> if C > end then end = C
//              >> go to 2.0 (repeat the 2-finger merge);
//    2b) if the two-finger merge runs out of chain on either the
//        C chain or the C++ chain (that is, a NULL):
//        >>  set the "out of chain" V element back to the innitial state;
//        >>  go back one chain pair ( "C = C--")
//            If V[C] == NULL
//              >> report out (end-start) as a maximal match (incrementing
//                 total by some amount),
//              >> move C over to "end" in the input stream
//              >> reset V[end+1]back to the chain starts.  Anything further
//                 hasn't been touched and so can be left alone.
//              >> go to 2.0
//
//    This algorithm still has the flaw thatn for the input string
//    ABCDE, the subchain BCDE is searched when the input string is at A.
//    and then again at B.  However, any local matches BC... are
//    gauranteed to be captured in the AB... match (we would look at
//    only the B's that follwed A's, not all of the B's even, so perhaps
//    this isn't much extra work after all.
//
//     Note: the reason for passing array of hashes rather than the original
//     string is that the calculation of the hashes is necessary and it's
//     more efficient to do it once and reuse.  Also, it means that the
//     hashes can be computed with a non-orthodox (i.e. not a string kernel)
//     method and that might take serious computes and many regexecs.

////////////////////////////////////////////////////////////////////////
//
//      Helper functions for the fast match.
//
////////////////////////////////////////////////////////////////////////

//     given a starting point, does it exist on a chain?
static unsigned int chain_search_one_chain
(
        FSCM_HASH_CHAIN_CELL *chains,
        unsigned int          chain_start,
        unsigned int          must_match
)
{
    unsigned int retval;
    unsigned int curch;
    static unsigned int cached_chain_start = 0;
    static unsigned int cached_chain_must_match = 0;
    static unsigned int cached_chain_prior = 0;

    retval = 0;
    curch = chain_start;

    if (internal_trace)
    {
        unsigned int j;
        j = chain_start;
        fprintf(stderr, "...chaintrace from %u: (next: %u)",
                j, chains[j].next);
        while (j != 0)
        {
            fprintf(stderr, " %u", j);
            j = chains[j].next;
        }
        fprintf(stderr, "\n");
    }

    if (internal_trace)
        fprintf(stderr, "   ... chain_search_one_chain chain %u mustmatch %u\n",
                chain_start, must_match);

    //  See if the caches can help us
    if (cached_chain_start == chain_start
        && cached_chain_must_match >= must_match)
    {
        if (internal_trace)
            fprintf(stderr, "   (using cache) ");
        curch = cached_chain_prior;
    }


    //  reset the cache either way.
    cached_chain_must_match = must_match;
    cached_chain_start = chain_start;

    while (curch > must_match && curch > 0)
    {
        if (internal_trace)
            fprintf(stderr, "   .... from %u to %u\n",
                    curch, chains[curch].next);
        cached_chain_prior = curch;
        curch = chains[curch].next;
    }
    if (curch == must_match)
    {
        if (internal_trace)
            fprintf(stderr, "   ...Success at chainindex %u\n", curch);
        return curch;
    }

    if (internal_trace)
        fprintf(stderr, "   ...Failed\n");
    return 0;
}


//    From here, how long does this chain run?
//
//      Do NOT implement this recursively, as a document matched against
//       itself will recurse for each character, so unless your compiler
//        can fix tail recursion, you'll blow the stack on long documents.
//
static unsigned int chain_run_length
(
        FSCM_HASH_CHAIN_CELL *chains,              //  the known-text chains
        unsigned int         *unk_indexes,         //  index vector to head of each chain
        unsigned int          unk_len,             //  length of index vctor
        unsigned int          starting_symbol,     //  symbol where we start
        unsigned int          starting_chain_index //  where it has to match (in chainspace)
)
{
    unsigned int offset;
    unsigned int search_chain;
    int in_loop;

    if (internal_trace)
        fprintf(stderr,
                "Looking for a chain member at symbol %u chainindex %u\n",
                starting_symbol, starting_chain_index);
    in_loop = 1;
    offset = 0;
    while ((starting_symbol + offset < unk_len) && in_loop)
    {
        search_chain = unk_indexes[starting_symbol + offset];
        if (internal_trace)
            fprintf(stderr,
                    "..searching at [symbol %u offset %u] chainindex %u\n",
                    starting_symbol, offset, search_chain);
        in_loop = chain_search_one_chain
                  (chains, search_chain, starting_chain_index + offset);
        offset++;
    }
    offset--;
    if (internal_trace)
        fprintf(stderr,
                "chain_run_length finished at chain index %u (offset %u)\n",
                starting_chain_index + offset, offset);
    return offset;
}

//        Note- the two-finger algorithm works- but it's actually kind of
//        hard to program in terms of it's asymmetry.  So instead, we use a
//        simpler repeated search algorithm with a cache at the bottom
//        level so we don't repeatedly search the same (failing) elements
//        of the chain).
//
//
//      NB: if this looks a little like how the genomics BLAST
//      match algorithm runs, yeah... I get that feeling too, although
//      I have not yet found a good description of how BLAST actually works
//      inside, and so can't say if this would be an improvement.  However,
//      it does beg the question of whether a BLAST-like algorithm might
//      work even _better_ for text matching.  Future note: use additional
//      flag <blast> to allow short interruptions of match stream.
//
//     longest_run_starting_here returns the length of the longest match
//      found that starts at exactly index[starting_symbol]
//
static unsigned int longest_run_starting_here
(
        FSCM_HASH_CHAIN_CELL *chains,         // array of interlaced chain cells
        unsigned int         *unk_indexes,    //  index vector to head of each chain
        unsigned int          unk_len,        //  length of index vector
        unsigned int          starting_symbol //  index of starting symbol
)
{
    unsigned int chain_index_start;    //  Where in the primary chain we are.
    unsigned int this_run, max_run;

    if (internal_trace)
        fprintf(stderr, "\n*** longest_run: starting at symbol %u\n",
                starting_symbol);

    chain_index_start = unk_indexes[starting_symbol];
    this_run = max_run = 0;
    if (chain_index_start == 0)
    {
        if (internal_trace)
            fprintf(stderr, "longest_run: no starting chain here; returning\n");
        return 0;    // edge case - no match
    }
    //   If we got here, we had at +least+ a max run of one match found
    //    (that being chain_index_start)
    this_run = max_run = 1;
    if (internal_trace)
        fprintf(stderr, "longest_run: found a first entry (chain %u)\n",
                chain_index_start);

    while (chain_index_start != 0)
    {
        if (internal_trace)
            fprintf(stderr, "Scanning chain starting at %u\n",
                    chain_index_start);
        this_run = chain_run_length
                   (chains, unk_indexes, unk_len,
                starting_symbol + 1, chain_index_start + 1) + 1;
        //
        if (internal_trace)
            fprintf(stderr,
                    "longest_run: chainindex run at %u is length %u\n",
                    chain_index_start, this_run);
        if (this_run > max_run)
        {
            if (internal_trace)
                fprintf(stderr, "longest_run: new maximum\n");
            max_run = this_run;
        }
        else
        {
            if (internal_trace)
                fprintf(stderr, "longest_run: not an improvement\n");
        }
        //     And go around again till we hit a zero chain index
        chain_index_start = chains[chain_index_start].next;
    }
    if (internal_trace)
        fprintf(stderr, "Final result at symbol %u run length is %u\n",
                starting_symbol, max_run);
    return max_run;
}

//     compress_me is the top-level calculating routine which calls
//     all of the prior routines in the right way.

static double compress_me
(
        unsigned int         *unk_indexes, //  prefix chain-entry table
        unsigned int          unk_len,     // length of the entry table
        FSCM_HASH_CHAIN_CELL *chains,      // array of interlaced chain cells
        double                q_exponent   // exponent of match
)
{
    unsigned int current_symbol, this_run_length;
    double total_score, incr_score;

    total_score = 0.0;
    current_symbol = 0;

    while (current_symbol < unk_len)
    {
        this_run_length = longest_run_starting_here
                          (chains, unk_indexes, unk_len, current_symbol);
        incr_score = 0;
        if (this_run_length > 0)
            incr_score = pow(this_run_length, q_exponent);
        total_score = total_score + incr_score;
        if (internal_trace)
            fprintf(stderr,  "Offset %u compresses %u score %lf\n",
                    current_symbol, this_run_length, incr_score);
        current_symbol = current_symbol + this_run_length;
        if (this_run_length == 0)
            current_symbol++;
    }
    return total_score;
}


//      How to do an Improved FSCM CLASSIFY of some text.
//
int crm_fast_substring_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    //      classify the compressed version of this text
    //      as belonging to a particular type.
    //
    //       Much of this code should look very familiar- it's cribbed from
    //       the code for LEARN
    //
    int i, k;

    char ptext[MAX_PATTERN]; //  the regex pattern
    int plen;

    //  the hash file names
    int htext_maxlen = MAX_PATTERN + MAX_CLASSIFIERS * MAX_FILE_NAME_LEN;

    //  the match statistics variable
    char stext[MAX_PATTERN + MAX_CLASSIFIERS * (MAX_FILE_NAME_LEN + 100)];
    int stext_maxlen = MAX_PATTERN + MAX_CLASSIFIERS * (MAX_FILE_NAME_LEN + 100);

    int slen;
    char svrbl[MAX_PATTERN]; //  the match statistics text buffer
    int svlen;
    int fnameoffset;
    int eflags;
    int cflags;
    int use_unique;
    int not_microgroom = 1;
    int use_unigram_features;

    struct stat statbuf;    //  for statting the hash file
    regex_t regcb;

    //   Total hits per statistics file - one hit is nominally equivalent to
    //   compressing away one byte
    //  int totalhits[MAX_CLASSIFIERS];
    //
    //  int totalfeatures;   //  total features
    double tprob;       //  total probability in the "success" domain.

    double ptc[MAX_CLASSIFIERS]; // current running probability of this class

    //    Classificer Coding Clarification- we'll do one file at a time, so
    //    these variables are moved to point to different statistics files
    //    in a loop.
    char *file_pointer;
    STATISTICS_FILE_HEADER_STRUCT *file_header; // the
    FSCM_PREFIX_TABLE_CELL *prefix_table;       //  the prefix indexing table,
    unsigned int prefix_table_size = 200000;
    FSCM_HASH_CHAIN_CELL *chains; //  the chain area
    unsigned int *unk_indexes;

    int fn_start_here;
    char htext[MAX_PATTERN];    // the text of the names (unparsed)
    int htextlen;
    char hfname[MAX_PATTERN];   //  the current file name
    int fnstart, fnlen;
    char hashfilenames[MAX_CLASSIFIERS][MAX_FILE_NAME_LEN]; //  names (parsed)
    int hashfilebytelens[MAX_CLASSIFIERS];
    int succhash;       // how many hashfilenames are "success" files?
    int vbar_seen;      // did we see '|' in classify's args?
    int maxhash;
    int bestseen;
    double scores[MAX_CLASSIFIERS]; //  per-classifier raw score.

    // We'll generate our unknown string's hashes directly into tempbuf.
    int unk_hashcount;
    crmhash_t *unk_hashes;

    unk_hashes = (crmhash_t *)tempbuf;

    if (internal_trace)
        fprintf(stderr, "executing a Fast Substring Compression CLASSIFY\n");


    //           extract the hash file names
    htextlen = crm_get_pgm_arg(htext, htext_maxlen, apb->p1start, apb->p1len);
    htextlen = crm_nexpandvar(htext, htextlen, htext_maxlen);
    CRM_ASSERT(htextlen < MAX_PATTERN);

    //           extract the "this is a compressible character" regex.
    //     Note that by and large this is not used!
    //
    plen = crm_get_pgm_arg(ptext, MAX_PATTERN, apb->s1start, apb->s1len);
    plen = crm_nexpandvar(ptext, plen, MAX_PATTERN);

    //            extract the optional "match statistics" variable
    //
    svlen = crm_get_pgm_arg(svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
    svlen = crm_nexpandvar(svrbl, svlen, MAX_PATTERN);
    CRM_ASSERT(svlen < MAX_PATTERN);
    {
        int vstart, vlen;
        if (crm_nextword(svrbl, svlen, 0, &vstart, &vlen))
        {
            memmove(svrbl, &svrbl[vstart], vlen);
            svlen = vlen;
            svrbl[vlen] = 0;
        }
        else
        {
            svlen = 0;
            svrbl[0] = 0;
        }
    }

    if (user_trace)
        fprintf(stderr, "Status out var %s (len %d)\n",
                svrbl, svlen);

    //     status variable's text (used for output stats)
    //
    stext[0] = 0;
    slen = 0;

    //            set our flags, if needed.  The defaults are
    //            "case"
    cflags = REG_EXTENDED;
    eflags = 0;

    if (apb->sflags & CRM_NOCASE)
    {
        cflags = REG_ICASE | cflags;
        eflags = 1;
    }

    not_microgroom = 1;
    if (apb->sflags & CRM_MICROGROOM)
    {
        not_microgroom = 0;
        if (user_trace)
            fprintf(stderr, " disabling fast-skip optimization.\n");
    }

    use_unique = 0;
    if (apb->sflags & CRM_UNIQUE)
    {
        use_unique = 1;
        if (user_trace)
            fprintf(stderr, " unique engaged - repeated features are ignored\n");
    }

    use_unigram_features = 0;
    if (apb->sflags & CRM_UNIGRAM)
    {
        use_unigram_features = 1;
        if (user_trace)
            fprintf(stderr, " using only unigram features.\n");
    }


    //      Create our hashes; we do this once outside the loop and
    //      thus save time inside the loop.



    crm_vector_tokenize_selector
    (apb,                                             // the APB
            txtptr,                                   // intput string
            txtlen,                                   // how many bytes
            txtstart,                                 // starting offset
            NULL,                                     // custom tokenizer
            NULL,                                     // custom coeff matrix
            unk_hashes,                               // where to put the hashed results
            data_window_size / sizeof(unk_hashes[0]), //  max number of hashes
            &unk_hashcount);                          // how many hashes we actually got

    if (internal_trace)
    {
        int display_count = CRM_MIN(16, unk_hashcount);
        fprintf(stderr, "Total %d hashes - first %d values:", unk_hashcount, display_count);
        for (i = 0; i < display_count; i++)
        {
            if (i % 8 == 0)
            {
                fprintf(stderr, "\n#%d..%d:", i, CRM_MIN(i + 7, display_count - 1));
            }
            fprintf(stderr, " %08lx", (unsigned long int)unk_hashes[i]);
        }
        fprintf(stderr, "\n");
    }


    if (user_trace)
        fprintf(stderr, "Total of %u initial features.\n", unk_hashcount);

    unk_indexes = (unsigned int *)calloc(unk_hashcount + 1, sizeof(unk_indexes[0]));

    //       Now, we parse the filenames and do a mmap/match/munmap loop
    //       on each file.  The resulting number of hits is stored in the
    //       the loop to open the files.

    vbar_seen = 0;
    maxhash = 0;
    succhash = 0;
    fnameoffset = 0;

    //    now, get the file names and mmap each file
    //     get the file name (grody and non-8-bit-safe, but doesn't matter
    //     because the result is used for open() and nothing else.
    //   GROT GROT GROT  this isn't NULL-clean on filenames.  But then
    //    again, stdio.h itself isn't NULL-clean on filenames.
    if (user_trace)
        fprintf(stderr, "Classify list: -%s- \n", htext);
    fn_start_here = 0;
    fnlen = 1;

    while (fnlen > 0 && ((maxhash < MAX_CLASSIFIERS - 1)))
    {
        if (crm_nextword(htext, htextlen, fn_start_here, &fnstart, &fnlen)
            && fnlen > 0)
        {
            strncpy(hfname, &htext[fnstart], fnlen);
            fn_start_here = fnstart + fnlen + 1;
            hfname[fnlen] = 0;

            strncpy(hashfilenames[maxhash], hfname, fnlen);
            hashfilenames[maxhash][fnlen] = '\000';
            if (user_trace)
                fprintf(stderr,
                        "Classifying with file -%s- succhash=%d, maxhash=%d\n",
                        hashfilenames[maxhash], succhash, maxhash);
            if (hfname[0] == '|' && hfname[1] == '\000')
            {
                if (vbar_seen)
                {
                    nonfatalerror
                    ("Only one ' | ' allowed in a CLASSIFY. \n",
                            "We'll ignore it for now.");
                }
                else
                {
                    succhash = maxhash;
                }
                vbar_seen++;
            }
            else
            {
                //  be sure the file exists
                //             stat the file to get it's length
                k = stat(hfname, &statbuf);
                //             quick check- does the file even exist?
                if (k != 0)
                {
                    nonfatalerror
                    ("Nonexistent Classify table named: ",
                            hfname);
                }
                else
                {
                    //  file exists - do the open/process/close
                    //
                    hashfilebytelens[maxhash] = statbuf.st_size;

                    //  mmap the hash file into memory so we can bitwhack it
                    file_pointer =
                        crm_mmap_file(hfname,
                                0, hashfilebytelens[maxhash],
                                PROT_READ,
                                MAP_SHARED, CRM_MADV_RANDOM,
                                NULL);

                    if (file_pointer == MAP_FAILED)
                    {
                        nonfatalerror
                        ("Couldn't memory-map the table file :",
                                hfname);
                    }
                    else
                    {
                        //    GROT GROT GROT
                        //    GROT  Actually implement this someday!!!
                        //     Check to see if this file is the right version
                        //    GROT GROT GROT

                        //  set up our pointers for the prefix table and
                        //   the chains
                        file_header =
                            (STATISTICS_FILE_HEADER_STRUCT *)file_pointer;
                        prefix_table_size = file_header->data[0];
                        if (internal_trace)
                            fprintf(stderr,
                                    "Prefix table at %u, chains at %u\n",
                                    (unsigned int)file_header->chunks[3].start,
                                    (unsigned int)file_header->chunks[4].start);

                        prefix_table = (FSCM_PREFIX_TABLE_CELL *)
                                       &file_pointer[file_header->chunks[3].start];

                        chains = (FSCM_HASH_CHAIN_CELL *)
                                 &file_pointer[file_header->chunks[4].start];

                        if (internal_trace)
                            fprintf(stderr,
                                    " Prefix table size = %d\n",
                                    prefix_table_size);
                        //    initialize the index vector to the chain starts
                        //    (some of which are NULL).
                        for (i = 0; i < unk_hashcount; i++)
                        {
                            unsigned int uhmpts;
                            uhmpts = unk_hashes[i] % prefix_table_size;
                            unk_indexes[i] = prefix_table[uhmpts].index;
                            if (internal_trace)
                            {
                                fprintf(stderr,
                                        "unk_hashes[%d] = %08lx, index = %u, prefix_table[%u] = %u\n",
                                        i, (unsigned long int)unk_hashes[i], uhmpts,
                                        uhmpts, prefix_table[uhmpts].index);
                            }
                        }
                        //  Now for the nitty-gritty - run the compression
                        //   of the unknown versus tis statistics file.
                        scores[maxhash] = compress_me
                                          (unk_indexes,
                                unk_hashcount,
                                chains,
                                1.5);
                    }
                    maxhash++;
                }
            }
            if (maxhash > MAX_CLASSIFIERS - 1)
                nonfatalerror("Too many classifier files.",
                        "Some may have been disregarded");
        }
    }

    //
    //    If there is no '|', then all files are "success" files.
    if (succhash == 0)
        succhash = maxhash;

    if (user_trace)
        fprintf(stderr, "Running with %d files for success out of %d files\n",
                succhash, maxhash);

    // sanity checks...  Uncomment for super-strict CLASSIFY.
    //
    //	do we have at least 1 valid .css files?
    if (maxhash == 0)
    {
        nonfatalerror
        ("Couldn't open at least one .css files for classify().",
                "");
    }
    //	do we have at least 1 valid .css file at both sides of '|'?
    // if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
    //  {
    //    nonfatalerror (
    //      "Couldn't open at least 1 .css file per SUCC | FAIL category "
    //	" for classify().\n","Hope you know what are you doing.");
    //  }

    ///////////////////////////////////////////////////////////
    //
    //   To translate score (which is exponentiated compression) we
    //   just normalize to a sum of 1.000 .  Note that we start off
    //   with a minimum per-class score of "tiny" to avoid divide-by-zero
    //   problems (zero scores on everything => divide by zero)

    tprob = 0.0;
    for (i = 0; i < MAX_CLASSIFIERS; i++)
        ptc[i] = 0.0;

    for (i = 0; i < maxhash; i++)
    {
        ptc[i] = scores[i];
        if (ptc[i] < 0.001)
            ptc[i] =   0.001;
        tprob = tprob + ptc[i];
    }
    //     Renormalize probabilities
    for (i = 0; i < maxhash; i++)
        ptc[i] = ptc[i] / tprob;

    if (user_trace)
    {
        for (k = 0; k < maxhash; k++)
            fprintf(stderr, "Match for file %d: compress: %f prob: %f\n",
                    k, scores[k], ptc[k]);
    }

    bestseen = 0;
    for (i = 0; i < maxhash; i++)
        if (ptc[i] > ptc[bestseen])
            bestseen = i;

    //     Reset tprob to contain sum of probabilities of success classes.
    tprob = 0.0;
    for (k = 0; k < succhash; k++)
        tprob = tprob + ptc[k];

    if (svlen > 0)
    {
        char buf[1024];
        double accumulator;
        double remainder;
        double overall_pR;
        int m;
        buf[0] = '\000';
        accumulator = 1000 * DBL_MIN;
        for (m = 0; m < succhash; m++)
        {
            accumulator = accumulator + ptc[m];
        }
        remainder = 1000 * DBL_MIN;
        for (m = succhash; m < maxhash; m++)
        {
            remainder = remainder + ptc[m];
        }

        if (internal_trace)
            fprintf(stderr, "succ: %d, max: %d, acc: %lf, rem: %lf\n",
                    succhash, maxhash, accumulator, remainder);

        overall_pR = 10 * (log10(accumulator) - log10(remainder));

        //   note also that strcat _accumulates_ in stext.
        //  There would be a possible buffer overflow except that _we_ control
        //   what gets written here.  So it's no biggie.

        if (tprob > 0.5000)
        {
            sprintf(buf, "CLASSIFY succeeds; success probability: %6.4f  pR: %6.4f\n", tprob, overall_pR);
        }
        else
        {
            sprintf(buf, "CLASSIFY fails; success probability: %6.4f  pR: %6.4f\n", tprob, overall_pR);
        }
        if (strlen(stext) + strlen(buf) <= stext_maxlen)
            strcat(stext, buf);

        remainder = 1000 * DBL_MIN;
        for (m = 0; m < maxhash; m++)
            if (bestseen != m)
            {
                remainder = remainder + ptc[m];
            }

        sprintf(buf,
                "Best match to file #%d (%s) prob: %6.4f  pR: %6.4f \n",
                bestseen,
                hashfilenames[bestseen],
                ptc[bestseen],
                10 * (log10(ptc[bestseen]) - log10(remainder))
               );

        if (strlen(stext) + strlen(buf) <= stext_maxlen)
            strcat(stext, buf);
        sprintf(buf, "Total features in input file: %d\n", unk_hashcount);
        if (strlen(stext) + strlen(buf) <= stext_maxlen)
            strcat(stext, buf);
        for (k = 0; k < maxhash; k++)
        {
            // int m; -- 'local 'm' hides decl of the same name in outer scope, so says MSVC. And we don't need this one to do that, so use the outer scope one.
            remainder = 1000 * DBL_MIN;
            for (m = 0; m < maxhash; m++)
                if (k != m)
                {
                    remainder = remainder + ptc[m];
                }
            sprintf(buf,
                    "#%d (%s):" \
                    " features: %d, compression: %6.2f, prob: %3.2e, pR: %6.2f \n",
                    k,
                    hashfilenames[k],
                    (int)(hashfilebytelens[k] / sizeof(FSCM_HASH_CHAIN_CELL))
                    - prefix_table_size, // GROT GROT shouldn't this be file dependend?  GROT GROT
                    scores[k],
                    ptc[k],
                    10 * (log10(ptc[k]) - log10(remainder)));

            // strcat (stext, buf);
            if (strlen(stext) + strlen(buf) <= stext_maxlen)
                strcat(stext, buf);
        }
        // check here if we got enough room in stext to stuff everything
        // perhaps we'd better rise a nonfatalerror, instead of just
        // whining on stderr
        if (strcmp(&(stext[strlen(stext) - strlen(buf)]), buf) != 0)
        {
            nonfatalerror("WARNING: not enough room in the buffer to create "
                          "the statistics text.  Perhaps you could try bigger "
                          "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
                    " ");
        }
        crm_destructive_alter_nvariable(svrbl, svlen,
                stext, (int)strlen(stext));
    }


    //  cleanup time!

    //  and let go of the regex buffery
    if (ptext[0] != '\0')
        crm_regfree(&regcb);


    if (tprob > 0.5000)
    {
        //   all done... if we got here, we should just continue execution
        if (user_trace)
            fprintf(stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
    }
    else
    {
        if (user_trace)
            fprintf(stderr, "CLASSIFY was a FAIL, skipping forward.\n");
        //    and do what we do for a FAIL here
        csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
        csl->aliusstk[csl->mct[csl->cstmt]->nest_level] = -1;
        return 0;
    }
    //
    // regcomp_failed:
    return 0;
}


#else /* CRM_WITHOUT_FSCM */

int crm_expr_fscm_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier has not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier has not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}

#endif




int crm_expr_fscm_css_merge(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_diff(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_backup(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_restore(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_info(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_analyze(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_create(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


int crm_expr_fscm_css_migrate(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
        char *txtptr, int txtstart, int txtlen)
{
    return nonfatalerror_ex(SRC_LOC(),
            "ERROR: the %s classifier tools have not been incorporated in this CRM114 build.\n"
            "You may want to run 'crm -v' to see which classifiers are available.\n",
            "FSCM");
}


