#include <R.h>
#include <Rinternals.h>
#include <string.h>

/* ================================================================
 * FNV-1a style hash for integer arrays
 *
 * Each element of arr[0..n-1] is folded into the running hash as one
 * whole value (xor, then multiply by the FNV prime), not byte by
 * byte. The final hash is reduced to a bucket index in [0, mod).
 * mod must be non-zero.
 * ================================================================ */
static unsigned int hash_ints(const int *arr, int n, unsigned int mod) {
    const unsigned int fnv_prime = 16777619u;
    unsigned int acc = 2166136261u;   /* FNV offset basis */
    const int *cursor = arr;

    for (int left = n; left > 0; left--) {
        acc = (acc ^ (unsigned int)*cursor++) * fnv_prime;
    }
    return acc % mod;
}

/* ================================================================
 * FNV-1a hash for strings
 *
 * Hashes the bytes of the NUL-terminated string s (terminator
 * excluded) and reduces the result to a bucket index in [0, mod).
 * mod must be non-zero.
 * ================================================================ */
static unsigned int hash_str(const char *s, unsigned int mod) {
    unsigned int acc = 2166136261u;   /* FNV offset basis */

    /* Read bytes as unsigned so high-bit characters are not sign-extended. */
    for (const unsigned char *p = (const unsigned char *)s; *p != '\0'; p++) {
        acc = (acc ^ *p) * 16777619u;
    }
    return acc % mod;
}

/* ================================================================
 * Token map: string -> int encoding
 *
 * A chained hash table that assigns consecutive integer ids
 * (0, 1, 2, ...) to distinct strings, plus a reverse array to get
 * the string back from an id.
 * ================================================================ */
#define TM_BUCKETS 4096   /* fixed number of hash buckets; never resized */

/* One node in a bucket's singly linked collision chain. */
typedef struct tm_entry {
    const char *str;         /* token text; the pointer is stored, not copied */
    int id;                  /* integer id assigned to str */
    struct tm_entry *next;   /* next node in the same bucket, or NULL */
} tm_entry;

typedef struct {
    tm_entry *buckets[TM_BUCKETS];   /* chain heads, indexed by hash_str() */
    const char **id_to_str;          /* reverse lookup: id_to_str[id] is the token text */
    int next_id;                     /* id the next previously unseen string receives */
    int capacity;                    /* number of slots allocated in id_to_str */
} token_map;

/* Reset tm to an empty map: every bucket chain empty, ids starting at 0,
 * and a reverse-lookup array with room for 2048 ids obtained from R_alloc. */
static void tm_init(token_map *tm) {
    for (int b = 0; b < TM_BUCKETS; b++) {
        tm->buckets[b] = NULL;
    }

    tm->capacity = 2048;
    tm->next_id = 0;
    tm->id_to_str = (const char **)R_alloc((size_t)tm->capacity,
                                            sizeof(const char *));
}

/* Return the integer id of str, assigning the next unused id the first
 * time a string is seen.
 *
 * The map keeps the pointer str itself (no copy is made), so the caller
 * must keep that storage alive for as long as the map is used. */
static int tm_encode(token_map *tm, const char *str) {
    const unsigned int slot = hash_str(str, TM_BUCKETS);

    /* Known token: walk the bucket chain and return the existing id. */
    for (tm_entry *cur = tm->buckets[slot]; cur != NULL; cur = cur->next) {
        if (!strcmp(cur->str, str)) {
            return cur->id;
        }
    }

    /* New token: claim the next id. */
    const int new_id = tm->next_id;
    tm->next_id = new_id + 1;

    /* Double the reverse-lookup array once it is full. */
    if (new_id >= tm->capacity) {
        const int old_cap = tm->capacity;
        const int grown = old_cap * 2;
        const char **bigger = (const char **)R_alloc((size_t)grown,
                                                      sizeof(const char *));
        memcpy(bigger, tm->id_to_str, (size_t)old_cap * sizeof(const char *));
        tm->capacity = grown;
        tm->id_to_str = bigger;
    }
    tm->id_to_str[new_id] = str;

    /* Push a fresh node onto the front of the bucket chain. */
    tm_entry *node = (tm_entry *)R_alloc(1, sizeof(tm_entry));
    node->next = tm->buckets[slot];
    node->id = new_id;
    node->str = str;
    tm->buckets[slot] = node;

    return new_id;
}

/* ================================================================
 * N-gram frequency hash table
 *
 * A chained hash table keyed by a sequence of token ids; each entry
 * carries the number of times that sequence has been inserted.
 * ================================================================ */
#define NG_BUCKETS 8192   /* fixed number of hash buckets; never resized */
#define MASK_VAL (-1)     /* written over masked tokens; counting skips any negative id */

/* One distinct n-gram and its running count. */
typedef struct ng_entry {
    int *toks;               /* private copy of the n token ids */
    int n;                   /* number of ids in toks */
    int count;               /* how many times this n-gram was inserted */
    struct ng_entry *next;   /* next entry in the same bucket, or NULL */
} ng_entry;

typedef struct {
    ng_entry *buckets[NG_BUCKETS];   /* chain heads, indexed by hash_ints() */
} ngram_ht;

/* Make ht an empty table by clearing every bucket chain head. */
static void ng_init(ngram_ht *ht) {
    for (int b = 0; b < NG_BUCKETS; b++) {
        ht->buckets[b] = NULL;
    }
}

/* Record one occurrence of the n-gram toks[0..n-1].
 *
 * If the same sequence is already in the table its count is incremented;
 * otherwise a new entry with count 1 is created, holding its own copy of
 * the ids in R_alloc storage. */
static void ng_insert(ngram_ht *ht, const int *toks, int n) {
    const unsigned int slot = hash_ints(toks, n, NG_BUCKETS);

    for (ng_entry *cur = ht->buckets[slot]; cur != NULL; cur = cur->next) {
        if (cur->n != n) {
            continue;
        }
        /* Length of the common prefix; equal to n means a full match. */
        int k = 0;
        while (k < n && cur->toks[k] == toks[k]) {
            k++;
        }
        if (k == n) {
            cur->count += 1;
            return;
        }
    }

    /* Not seen yet: build a new entry and push it on the chain front. */
    ng_entry *node = (ng_entry *)R_alloc(1, sizeof(ng_entry));
    node->toks = (int *)R_alloc((size_t)n, sizeof(int));
    memcpy(node->toks, toks, (size_t)n * sizeof(int));
    node->count = 1;
    node->n = n;
    node->next = ht->buckets[slot];
    ht->buckets[slot] = node;
}

/* ================================================================
 * Build combination string from token IDs (concatenated, no spaces)
 *
 * Looks up the text of each id in toks[0..n-1] through
 * tm->id_to_str and joins the pieces back to back into a new
 * NUL-terminated buffer obtained from R_alloc.
 * ================================================================ */
static const char *build_combination(token_map *tm, const int *toks, int n) {
    /* First pass: size of the result, starting at 1 for the terminator. */
    size_t need = 1;
    for (int i = 0; i < n; i++) {
        need += strlen(tm->id_to_str[toks[i]]);
    }

    char *out = (char *)R_alloc(need, sizeof(char));

    /* Second pass: append each token's bytes at the running offset. */
    size_t off = 0;
    for (int i = 0; i < n; i++) {
        const char *piece = tm->id_to_str[toks[i]];
        const size_t piece_len = strlen(piece);
        memcpy(out + off, piece, piece_len);
        off += piece_len;
    }
    out[off] = '\0';

    return out;
}

/* ================================================================
 * C_ngram_core: main n-gram analysis with masking
 *
 * Arguments:
 *   lines_sexp    - list of character vectors (tokenized lines)
 *   max_n_sexp    - integer vector; its first element is the maximum
 *                   n-gram length
 *   min_freq_sexp - integer vector, minimum frequency per length:
 *                   element n applies to length n, and the last
 *                   element applies to every length past the end
 *
 * Processing, for n = max_n down to 1:
 *   1. count every window of n consecutive unmasked tokens over all
 *      lines of the working copy;
 *   2. keep the n-grams whose count reaches the threshold for n,
 *      ordered by descending count;
 *   3. append them to the result;
 *   4. for each kept n-gram, leave its first occurrence in place and
 *      overwrite every later occurrence with MASK_VAL, so shorter
 *      lengths no longer count those tokens.
 *
 * Returns: list with components combination (character), length
 * (integer) and frequency (integer), one element per reported n-gram.
 *
 * Raises an R error when lines is not a list, when a non-empty
 * element of lines is not a character vector, when max_n is not an
 * integer vector with at least one element, when min_freq is not an
 * integer vector, or when min_freq is empty although max_n >= 1.
 *
 * All scratch memory comes from R_alloc; nothing is freed here.
 * ================================================================ */
SEXP C_ngram_core(SEXP lines_sexp, SEXP max_n_sexp, SEXP min_freq_sexp) {
    /* --- Validate arguments before reading their payloads --- */
    int n_lines = LENGTH(lines_sexp);
    if (n_lines > 0 && TYPEOF(lines_sexp) != VECSXP) {
        error("'lines' must be a list of character vectors");
    }
    if (TYPEOF(max_n_sexp) != INTSXP || LENGTH(max_n_sexp) < 1) {
        error("'max_n' must be an integer vector with at least one element");
    }
    if (TYPEOF(min_freq_sexp) != INTSXP) {
        error("'min_freq' must be an integer vector");
    }

    int max_n   = INTEGER(max_n_sexp)[0];
    int *min_freq = INTEGER(min_freq_sexp);
    int mf_len    = LENGTH(min_freq_sexp);

    /* The threshold lookup below reads min_freq[mf_len - 1]; an empty
     * vector would make that an out-of-bounds read. */
    if (max_n >= 1 && mf_len < 1) {
        error("'min_freq' must have at least one element");
    }

    /* --- Encode tokens to integers --- */
    token_map tm;
    tm_init(&tm);

    int **lines = (int **)R_alloc((size_t)n_lines, sizeof(int *));
    int *lens   = (int *)R_alloc((size_t)n_lines, sizeof(int));

    for (int li = 0; li < n_lines; li++) {
        SEXP line = VECTOR_ELT(lines_sexp, li);
        int len = LENGTH(line);
        if (len > 0 && TYPEOF(line) != STRSXP) {
            error("element %d of 'lines' is not a character vector", li + 1);
        }
        lens[li] = len;
        lines[li] = (int *)R_alloc((size_t)len, sizeof(int));
        for (int j = 0; j < len; j++) {
            /* The token map stores the CHAR() pointer itself; it stays
             * valid because it belongs to the lines_sexp argument. */
            lines[li][j] = tm_encode(&tm, CHAR(STRING_ELT(line, j)));
        }
    }

    /* --- Working copy (will be masked in-place) --- */
    int **work = (int **)R_alloc((size_t)n_lines, sizeof(int *));
    for (int li = 0; li < n_lines; li++) {
        work[li] = (int *)R_alloc((size_t)lens[li], sizeof(int));
        memcpy(work[li], lines[li], (size_t)lens[li] * sizeof(int));
    }

    /* --- Dynamic result arrays (three parallel arrays, doubled on demand) --- */
    int res_cap = 256, res_n = 0;
    const char **res_comb = (const char **)R_alloc((size_t)res_cap,
                                                    sizeof(const char *));
    int *res_len  = (int *)R_alloc((size_t)res_cap, sizeof(int));
    int *res_freq = (int *)R_alloc((size_t)res_cap, sizeof(int));

    /* --- Main loop: n from max_n down to 1 --- */
    for (int n = max_n; n >= 1; n--) {
        /* Threshold for this length; lengths past the end of min_freq
         * reuse its last element. */
        int mf = (n <= mf_len) ? min_freq[n - 1] : min_freq[mf_len - 1];

        /* Count n-grams in working copy */
        ngram_ht ht;
        ng_init(&ht);

        for (int li = 0; li < n_lines; li++) {
            if (lens[li] < n) continue;
            for (int pos = 0; pos <= lens[li] - n; pos++) {
                /* A window touching any masked (negative) token is not counted. */
                int skip = 0;
                for (int k = 0; k < n; k++) {
                    if (work[li][pos + k] < 0) { skip = 1; break; }
                }
                if (skip) continue;
                ng_insert(&ht, &work[li][pos], n);
            }
        }

        /* Collect frequent n-grams */
        int fc = 0, f_cap = 128;
        ng_entry **freq = (ng_entry **)R_alloc((size_t)f_cap,
                                                sizeof(ng_entry *));

        for (int b = 0; b < NG_BUCKETS; b++) {
            for (ng_entry *e = ht.buckets[b]; e; e = e->next) {
                if (e->count >= mf) {
                    if (fc >= f_cap) {
                        int nc = f_cap * 2;
                        ng_entry **na = (ng_entry **)R_alloc((size_t)nc,
                                                              sizeof(ng_entry *));
                        memcpy(na, freq, (size_t)f_cap * sizeof(ng_entry *));
                        freq = na;
                        f_cap = nc;
                    }
                    freq[fc++] = e;
                }
            }
        }

        if (fc == 0) continue;

        /* Sort by count descending. Insertion sort is stable, so ties keep
         * their bucket-traversal order.
         * NOTE(review): quadratic in fc; assumes few n-grams pass the
         * threshold - confirm for large inputs. */
        for (int i = 1; i < fc; i++) {
            ng_entry *tmp = freq[i];
            int j = i - 1;
            while (j >= 0 && freq[j]->count < tmp->count) {
                freq[j + 1] = freq[j];
                j--;
            }
            freq[j + 1] = tmp;
        }

        /* Store results */
        for (int fi = 0; fi < fc; fi++) {
            if (res_n >= res_cap) {
                int nc = res_cap * 2;
                const char **nc1 = (const char **)R_alloc((size_t)nc,
                                                           sizeof(const char *));
                int *nc2 = (int *)R_alloc((size_t)nc, sizeof(int));
                int *nc3 = (int *)R_alloc((size_t)nc, sizeof(int));
                memcpy(nc1, res_comb, (size_t)res_cap * sizeof(const char *));
                memcpy(nc2, res_len,  (size_t)res_cap * sizeof(int));
                memcpy(nc3, res_freq, (size_t)res_cap * sizeof(int));
                res_comb = nc1;
                res_len  = nc2;
                res_freq = nc3;
                res_cap  = nc;
            }
            res_comb[res_n] = build_combination(&tm, freq[fi]->toks, n);
            res_len[res_n]  = n;
            res_freq[res_n] = freq[fi]->count;
            res_n++;
        }

        /* Mask duplicates: keep first occurrence, mask the rest.
         * 'first' is tracked per n-gram across all lines, so only the
         * earliest occurrence in the whole input is kept.
         * NOTE(review): after the kept occurrence the scan advances by
         * one token, so a later occurrence overlapping it is masked in
         * full, which also blanks the tail of the kept occurrence -
         * confirm this is intended. */
        for (int fi = 0; fi < fc; fi++) {
            ng_entry *e = freq[fi];
            int first = 0;

            for (int li = 0; li < n_lines; li++) {
                if (lens[li] < n) continue;
                int pos = 0;
                while (pos <= lens[li] - n) {
                    /* Token ids are non-negative, so a window containing
                     * MASK_VAL can never match. */
                    int match = 1;
                    for (int k = 0; k < n; k++) {
                        if (work[li][pos + k] != e->toks[k]) {
                            match = 0;
                            break;
                        }
                    }
                    if (match) {
                        if (!first) {
                            first = 1;
                            pos++;
                        } else {
                            for (int k = 0; k < n; k++) {
                                work[li][pos + k] = MASK_VAL;
                            }
                            pos += n;
                        }
                    } else {
                        pos++;
                    }
                }
            }
        }
    }

    /* --- Build R result --- */
    SEXP result = PROTECT(allocVector(VECSXP, 3));
    SEXP r_comb = PROTECT(allocVector(STRSXP, res_n));
    SEXP r_len  = PROTECT(allocVector(INTSXP, res_n));
    SEXP r_freq = PROTECT(allocVector(INTSXP, res_n));

    for (int i = 0; i < res_n; i++) {
        /* NOTE(review): the bytes come straight from CHAR() on the input
         * but are marked as UTF-8 here; this presumably relies on the
         * caller passing UTF-8 (or ASCII) tokens - confirm. */
        SET_STRING_ELT(r_comb, i, mkCharCE(res_comb[i], CE_UTF8));
        INTEGER(r_len)[i]  = res_len[i];
        INTEGER(r_freq)[i] = res_freq[i];
    }

    SET_VECTOR_ELT(result, 0, r_comb);
    SET_VECTOR_ELT(result, 1, r_len);
    SET_VECTOR_ELT(result, 2, r_freq);

    SEXP names = PROTECT(allocVector(STRSXP, 3));
    SET_STRING_ELT(names, 0, mkChar("combination"));
    SET_STRING_ELT(names, 1, mkChar("length"));
    SET_STRING_ELT(names, 2, mkChar("frequency"));
    setAttrib(result, R_NamesSymbol, names);

    /* result, r_comb, r_len, r_freq, names */
    UNPROTECT(5);
    return result;
}
