/* kw.c - Keyword functions.
   Copyright (C) 2000 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA.  */

/* Written by Marc Tardif <intmktg@cam.org>.  */

/* The context algorithm in this program can be summarised with the
   following illustration:

   str             fpos                   max bpos
    v               v                       v v
    +------------------------------------------------------------+
                               opt.context

   The str address points to the beginning of the memory location
   where the context for an identifier is manipulated.  The size of
   this segment is kept in the opt.context global variable which is
   usually set to DEFAULT_CONTEXT.  The actual memory allocated is
   increased by one byte to account for padding as described below.

   When an identifier is found in the input, the context preceding
   the current position is copied backwards in the str buffer.  The
   starting position is set to str + opt.context, where characters
   are copied from the input buffer until a paragraph delimiter is
   reached or the beginning of the str buffer.  The resulting
   offset from str is stored in the bpos variable.

   The maximum forward context is then set according to the number
   of characters read backwards.  The object is to retain as much
   context as possible while keeping the identifier as centered as
   possible.  Therefore, max is set to opt.context / 2 or bpos
   depending on which is greater.  Characters are then copied from
   the input buffer starting at str until a paragraph delimiter is
   reached or max.  If the current buffer is exhausted prematurely,
   new data is read into the input buffer and the remaining context
   is processed.  If the last copied character is a space, fpos is
   set to the previous offset, otherwise it is set to the current
   offset.  This makes sure opt.context relevant bytes are copied
   which explains the need for a padding byte.

   After fpos has been determined, the value of bpos must be re-
   evaluated.  If fpos is greater than bpos, the latter is set to
   fpos + 1, otherwise it remains unchanged.  The actual context
   then consists of two fragments:
   - The first half is contained between str + bpos and str +
     opt.context.
   - The second half is located at the beginning of the str
     buffer and extends for fpos bytes.
*/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include "ac.h"
#include "kw.h"
#include "mem.h"
#include "options.h"

#include "text.h"      /* TEXT functions.  */
#include "html.h"      /* HTML functions.  */

/* Nonzero if C is one of the ASCII white-space codes: horizontal tab
   (0x09), line feed, vertical tab, form feed, carriage return (0x0D)
   or space (0x20).  C is evaluated more than once, so it must not
   have side effects.  */
#define is_space(c) ((c) == 0x20 || ((c) >= 0x09 && (c) <= 0x0D))

/* Scanner states used by kw_exec.  Both enumerators are negative; any
   other state value is used by kw_exec as an index into kw->pat.  */
enum state
{
  SRCH = -2, /* search buffer                   */
  SAVE       /* save buffer                     */
             /* SAVE is -1: the value after it  */
             /* would be zero                   */
};

/* Tokens of the search expression.  The operator tokens are also
   stored in the value member of syntax tree nodes; every enumerator
   is negative, and the tree code tests value < 0 to tell operator
   nodes from keyword leaves.  */
enum token
{
  TERM = -8, /* term in expression              */
  OPEN,      /* open parenthesis                */
  CLOSE,     /* close parenthesis               */
  AND,       /* boolean operator                */
  NEAR,      /* boolean operator                */
  NOT,       /* boolean operator                */
  OR,        /* boolean operator                */
  END        /* end of buffer                   */
             /* END is -1: the value after it   */
             /* would be zero                   */
};

/* Keyword state shared by every function in this file; allocated by
   kw_init.  */
struct kw *kw;

/* Format-specific scanner entry points, selected by kw_prep.  */
static unsigned char * (*goforward) (char *, char *);
static int (*gofind) (char **, char **, char *);

/* Recursive functions for building syntax tree.  The second argument
   is the current lookahead token; it is declared with the same type
   as in the definitions so that the declarations are compatible on
   every implementation, not only where enum token is int-sized.  */
static struct tree *expr (char **, enum token *);
static struct tree *parse (char **, enum token *);

/* Read the next search term from *STR and register it as a keyword.

   Leading white space is skipped.  A term is either text enclosed in
   a pair of double or single quotes (a quote preceded by a backslash
   does not close the term, and the character right after the opening
   quote is never taken as the closing quote), or a bare word that
   runs up to the next white space not preceded by a backslash, a
   closing parenthesis or the end of the string.  On success *STR is
   advanced past the term, including the closing quote if any.

   Returns a newly allocated leaf whose left and right members point
   to the beginning and end of the term and whose value is the keyword
   index, or NULL at end of string, on an unterminated quote, or when
   ac_incr or mem_alloc fails.

   NOTE(review): ac_incr is assumed to return the one-based index of
   the keyword (kw->num + 1 for a keyword not seen before) and zero on
   failure, which is how kw_init uses it -- confirm against ac.c.  */
static struct tree *
gettoken (char **str)
{
  register unsigned char c;
  register char *beg, *end;
  struct tree *tree;
  char quote;
  int val;

  beg = *str;
  while (is_space (*beg))
    beg++;

  end = beg;
  switch (*end)
    {
    case '\0':
      return NULL;
    case '\"':
    case '\'':
      quote = *beg;
      end = ++beg;
      while ((c = *end++) != '\0')
        if (*end == quote && c != '\\')
          break;
      /* When the loop stops on the terminating NUL, END already points
         one past it and must not be dereferenced.  */
      if (c == '\0')
        return NULL;
      *str = end + 1;
      break;
    default:
      do
        c = *end++;
      while ((c == '\\' || !is_space (*end))
             && *end != ')' && *end != '\0');
      *str = end;
      break;
    }

  /* Convert to a zero-based index.  A negative result means ac_incr
     failed; index zero is a legitimate (possibly repeated) keyword
     and must not be mistaken for an error.  */
  val = ac_incr (beg, end, kw->num) - 1;
  if (val < 0)
    return NULL;
  if (val == kw->num)
    kw->num++;

  tree = mem_alloc (sizeof (struct tree));
  if (!tree)
    return NULL;

  tree->left = beg;
  tree->right = end;
  tree->value = val;

  return tree;
}

/* If S starts with the operator word spelled LOWER (with UPPER being
   the same word in capitals), in any mix of the two cases, and the
   word is followed by white space or the end of the string, return
   the length of the word.  Otherwise return zero.  */
static int
operator_length (const char *s, const char *lower, const char *upper)
{
  int n;

  for (n = 0; lower[n] != '\0'; n++)
    if (s[n] != lower[n] && s[n] != upper[n])
      return 0;

  return (is_space (s[n]) || s[n] == '\0') ? n : 0;
}

/* Classify the token that begins at *STR after skipping white space.

   Parentheses and the operator words AND, NEAR, NOT and OR are
   consumed: *STR is left just past them.  At the end of the string
   END is returned.  Anything else is reported as TERM with *STR left
   at its first character so that gettoken can read it.  */
static enum token
lookahead (char **str)
{
  char *p;
  int len;

  for (p = *str; is_space (*p); p++)
    continue;
  *str = p;

  if (*p == '\0')
    return END;

  if (*p == '(')
    {
      *str = p + 1;
      return OPEN;
    }

  if (*p == ')')
    {
      *str = p + 1;
      return CLOSE;
    }

  if ((len = operator_length (p, "and", "AND")) != 0)
    {
      *str = p + len;
      return AND;
    }

  if ((len = operator_length (p, "not", "NOT")) != 0)
    {
      *str = p + len;
      return NOT;
    }

  if ((len = operator_length (p, "near", "NEAR")) != 0)
    {
      *str = p + len;
      return NEAR;
    }

  if ((len = operator_length (p, "or", "OR")) != 0)
    {
      *str = p + len;
      return OR;
    }

  return TERM;
}

/* Return a copy of the operator nodes of TREE.  Keyword leaves
   (value >= 0) are not copied: the original leaf is shared by the
   copy.  Returns NULL if any node of the copy cannot be allocated,
   so that callers never receive a partially built tree.  */
static struct tree *
treedup (struct tree *tree)
{
  struct tree *copy;

  if (tree->value >= 0)
    return tree;

  copy = mem_alloc (sizeof (struct tree));
  if (!copy)
    return NULL;

  copy->value = tree->value;

  copy->left = treedup (tree->left);
  if (!copy->left)
    return NULL;

  copy->right = treedup (tree->right);
  if (!copy->right)
    return NULL;

  return copy;
}

/* Build the tree for "L NEAR R" so that every NEAR node ends up with
   two keyword leaves as children, which is what eval expects.

   The four cases, tested in this order, are:
   - R is an operator node: NEAR is distributed over R's operands and
     R itself is reused as the root.
   - L is a NEAR node (and R a leaf): a new AND node joins L with
     "L->right NEAR R".
   - L is any other operator node (and R a leaf): NEAR is distributed
     over L's operands and L itself is reused as the root.
   - L and R are both leaves: a new NEAR node is allocated.

   L and R may be modified in place.  Returns the root of the result,
   or NULL if an allocation fails.  */
static struct tree *
treenorm (struct tree *l, struct tree *r)
{
  struct tree *tree;

  if (r->value < 0)
    {
      /* The recursive calls below may rewrite their left argument in
         place, so each operand of R gets its own copy of L's operator
         nodes.  */
      tree = treedup (l);
      if (!tree)
        return NULL;

      r->left = treenorm (l, r->left);
      r->right = treenorm (tree, r->right);
      if (!r->left || !r->right)
        return NULL;

      return r;
    }
  else if (l->value == NEAR)
    {
      /* This test must precede the generic l->value < 0 test because
         NEAR is itself negative.  */
      tree = mem_alloc (sizeof (struct tree));
      if (!tree)
        return NULL;

      tree->left = l;
      tree->value = AND;
      tree->right = treenorm (l->right, r);
      if (!tree->right)
        return NULL;

      return tree;
    }
  else if (l->value < 0)
    {
      /* R is a leaf here, so it can be shared by both operands.  */
      l->left = treenorm (l->left, r);
      l->right = treenorm (l->right, r);
      if (!l->left || !l->right)
        return NULL;

      return l;
    }
  else
    {
      tree = mem_alloc (sizeof (struct tree));
      if (!tree)
        return NULL;

      tree->left = l;
      tree->right = r;
      tree->value = NEAR;
      return tree;
    }
}

/* Parse one operand of the expression: either a parenthesised
   sub-expression or a single term.  On entry *TOK is the token that
   introduces the operand; on a normal return it holds the token that
   follows the operand.  Returns the operand's tree, or NULL if *TOK
   cannot start an operand, if a parenthesis is left open at the end
   of the string, or if the operand itself could not be built.  */
static struct tree *
expr (char **str, enum token *tok)
{
  struct tree *operand;

  switch (*tok)
    {
    case OPEN:
      operand = parse (str, tok);
      /* Running into the end of the string means the closing
         parenthesis is missing.  */
      if (*tok == END)
        return NULL;
      break;

    case TERM:
      operand = gettoken (str);
      break;

    default:
      return NULL;
    }

  *tok = lookahead (str);
  return operand;
}

/* Parse a sequence of operands joined by AND, NOT, OR or NEAR, from
   left to right, until a closing parenthesis or the end of the string
   is reached.  AND, NOT and OR each produce a new node holding the
   tree built so far on the left and the next operand on the right.
   NEAR is combined with the tree built so far through treenorm.

   On return *TOK holds the token that stopped the parse.  Returns the
   resulting tree, or NULL on a syntax error (a missing operand or two
   operands in a row) or an allocation failure.  */
static struct tree *
parse (char **str, enum token *tok)
{
  struct tree *result, *node;

  *tok = lookahead (str);
  result = expr (str, tok);

  while (result)
    {
      if (*tok == END || *tok == CLOSE)
        return result;

      /* Two operands without an operator between them.  */
      if (*tok == TERM || *tok == OPEN)
        return NULL;

      if (*tok == NEAR)
        {
          *tok = lookahead (str);
          node = expr (str, tok);
          if (!node)
            return NULL;

          /* A NULL result ends the loop and is reported below.  */
          result = treenorm (result, node);
        }
      else if (*tok == AND || *tok == NOT || *tok == OR)
        {
          node = mem_alloc (sizeof (struct tree));
          if (!node)
            return NULL;

          node->value = *tok;
          *tok = lookahead (str);

          node->left = result;
          node->right = expr (str, tok);
          if (!node->right)
            return NULL;

          result = node;
        }
      else
        abort ();
    }

  return NULL;
}

/* Walk TREE and initialise the kw->pat entry of every keyword leaf.

   For each leaf the distance table, position and count are cleared
   and the keyword text and length are recorded.  When context lines
   have to be saved (output is not quiet and opt.occurences is
   nonzero), a table of opt.occurences line buffers, each
   ALIGN (opt.context + 1) bytes long and initially empty, is
   allocated for the keyword.

   Returns 1 on success and 0 if an allocation fails anywhere in the
   tree.  */
int
arrayfill (struct tree *tree)
{
  register int i;
  struct pat *pat;

  /* Operator node: both subtrees must succeed.  */
  if (tree->value < 0)
    return arrayfill (tree->left) && arrayfill (tree->right);

  pat = &kw->pat[tree->value];
  for (i = 0; i < kw->num; i++)
    pat->distance[i] = 0;
  pat->position = 0;
  pat->count = 0;
  pat->str = tree->left;
  pat->len = (char *)tree->right - (char *)tree->left;

  if (!opt.out_quiet && opt.occurences)
    {
      int context;
      char **line;

      /* One block holds the array of line pointers followed by the
         line buffers themselves.  */
      context = ALIGN (opt.context + 1);
      line = mem_alloc ((opt.occurences * sizeof (char *))
                        + (opt.occurences * context));
      if (!line)
        return 0;

      line[0] = (char *)(line + opt.occurences);
      line[0][0] = '\0';
      for (i = 1; i < opt.occurences; i++)
        {
          line[i] = line[0] + i * context;
          line[i][0] = '\0';
        }
      pat->line = line;
    }

  return 1;
}

/* Set up the keyword module for the search expression STR.

   When opt.fixed_string is set, STR as a whole is the single keyword;
   otherwise STR is parsed as a boolean expression.  A single block is
   then allocated that holds, in this order, the context buffer
   (ALIGN (opt.context + 1) bytes), the array of kw->num pattern
   structures, and a kw->num by kw->num table of distances with one
   row per pattern.

   Returns the number of keywords on success, -1 if the expression
   cannot be parsed, and 0 on any other failure.  */
int
kw_init (char *str)
{
  register int i, j;
  int *distance;

  kw = mem_init (sizeof (struct kw));
  if (!kw)
    return 0;

  if (!ac_init (opt.ignore_case))
    return 0;

  if (opt.fixed_string)
    {
      kw->tree = mem_alloc (sizeof (struct tree));
      if (!kw->tree)
        return 0;

      kw->tree->value = 0;
      kw->tree->left = str;
      kw->tree->right = str + strlen (str);

      kw->num = ac_incr (kw->tree->left, kw->tree->right, 0);
      if (!kw->num)
        return 0;
    }
  else
    {
      enum token tok;

      kw->num = 0;
      kw->tree = parse (&str, &tok);
      if (!kw->tree)
        return -1;
    }

  kw->str = mem_alloc (ALIGN (opt.context + 1)
                       + kw->num * (ALIGN (sizeof (struct pat))
                                    + sizeof (int) * (kw->num)));
  if (!kw->str)
    return 0;
  kw->pat = (struct pat *)(kw->str + ALIGN (opt.context + 1));

  /* The distance table starts right after the space reserved for the
     pattern array.  That offset is a byte count, so it has to be
     applied to a char pointer: adding it to kw->pat directly would
     scale it by sizeof (struct pat) and land outside the block.  */
  distance = (int *)((char *)kw->pat
                     + kw->num * ALIGN (sizeof (struct pat)));
  for (i = 0; i < kw->num; i++)
    {
      kw->pat[i].position = 0;
      kw->pat[i].distance = distance + i * kw->num;
      for (j = 0; j < kw->num; j++)
        kw->pat[i].distance[j] = 0;
    }

  if (!arrayfill (kw->tree)
      || !ac_prep ())
    return 0;

  return kw->num;
}

/* Reset the scanner state and select the scanning functions for
   input of the given TYPE: the HTML functions for HTML, the plain
   text functions for anything else.  */
void
kw_prep (enum extension type)
{
  /* Start in search state with no forward context collected.  */
  kw->state = SRCH;
  kw->fpos = 0;

  /* Both tables restart at the bottom of their stacks.  */
  kw->context_table = kw->cstack;
  kw->context_table[0] = NULL;
  kw->pattern_table = kw->pstack;

  if (type != HTML)
    {
      kw->pattern_table[0] = text_first;
      gofind = text_find;
      goforward = text_forward;
    }
  else
    {
      kw->pattern_table[0] = html_first;
      gofind = html_find;
      goforward = html_forward;
    }
}

/* Scan the input for keywords, resuming in the state saved in
   kw->state by the previous call.

   POS, BUF and LIM are handed to the format-specific scanner selected
   by kw_prep.  NOTE(review): they appear to be the current position,
   the start of the input buffer and its limit -- confirm against
   text.c and html.c.

   In state SRCH the scanner is called and its return value becomes
   the next state.  State SAVE ends the call.  Any other state is the
   index in kw->pat of a keyword that was just found: its context is
   printed or saved as the options require, and its distance to the
   other keywords, its position and its count are updated.

   The state to resume from is stored in kw->state.  Returns *POS
   minus the value *POS had on entry, or minus the result of
   text_backward when the call ends in state SAVE and output is not
   quiet.  */
int
kw_exec (char **pos, char **buf, char *lim)
{
  register int i, state;
  struct pat *pat;
  char *beg;

  state = kw->state;
  beg = *pos;

  for (;;)
    {
      switch (state)
        {
        case SRCH:
          state = gofind (pos, buf, lim);
          break;
        case SAVE:
          if (!opt.out_quiet)
            beg = text_backward (*pos);
          state = SRCH;
          goto fin;

        /* Pattern found: state is its index in kw->pat.  */
        default:
          pat = &kw->pat[state];
          /* Context is needed only when lines are printed, and when
             occurrences are limited only until the limit is reached.  */
          if (!opt.out_quiet
              && (!opt.occurences || pat->count < opt.occurences))
            {
              if (*kw->context_table)
                goforward (*buf, lim);
              else
                {
                  text_backward (*pos);
                  /* Leave with state still set to the keyword index so
                     that the next call comes back to this case.
                     NOTE(review): a nonzero result presumably means
                     the forward context continues in the next buffer
                     -- confirm against goforward's implementations.  */
                  if (goforward (*buf, lim))
                    goto fin;
                }

              /* Keep the backward fragment from overlapping the
                 forward fragment stored at the start of kw->str.  */
              kw->bpos = kw->bpos > kw->fpos + 1
                         ? kw->bpos : kw->fpos + 1;
              /* Do not start the context with a space.  */
              if (kw->str[kw->bpos] == ' ')
                kw->bpos++;

              if (!opt.occurences)
                {
                  /* Print the line at once: the backward fragment
                     followed by the forward fragment.  */
                  if (opt.with_filename)
                    printf ("%s:", opt.filename);
                  if (opt.with_keyword)
                    printf ("%.*s:", pat->len, pat->str);
                  printf ("%.*s%.*s\n",
                          (int)opt.context - kw->bpos + 1, kw->str + kw->bpos,
                          kw->fpos, kw->str);
                }
              else
                {
                  /* Save the same two fragments, NUL-terminated, in
                     the keyword's next line buffer for kw_print.  */
                  memcpy (pat->line[pat->count],
                          kw->str + kw->bpos,
                          opt.context - kw->bpos + 1);
                  memcpy (pat->line[pat->count] + opt.context - kw->bpos + 1,
                          kw->str,
                          kw->fpos);
                  pat->line[pat->count][opt.context - kw->bpos
                                        + kw->fpos + 1] = '\0';
                }
              *kw->context_table = NULL;
              kw->fpos = 0;
            }

          /* For every keyword already seen (nonzero position), record
             the gap between kw->words and its position if no gap is
             recorded yet or this one is smaller.
             NOTE(review): kw->words is presumably a running word count
             kept by the scanner -- confirm.  */
          for (i=0; i<kw->num; i++)
            if (kw->pat[i].position
                && (!pat->distance[i]
                    || pat->distance[i] > kw->words - kw->pat[i].position))
              pat->distance[i] = kw->words - kw->pat[i].position;
          pat->position = kw->words;
          pat->count++;
          state = SRCH;
          break;
        }
    }

 fin:
  kw->state = state;
  return *pos - beg;
}

static int
eval (struct tree *tree)
{
  struct tree *left, *right;

  switch (tree->value)
    {
    case AND:
      return eval (tree->left) &&  eval (tree->right);
    case NEAR:
      left = tree->left;
      right = tree->right;
      if ((kw->pat[left->value].distance[right->value]
            && kw->pat[left->value].distance[right->value] <= opt.distance)
          || (kw->pat[right->value].distance[left->value]
            && kw->pat[right->value].distance[left->value] <= opt.distance))
        return 1;
      return 0;
    case NOT:
      return eval (tree->left) && !eval (tree->right);
    case OR:
      return eval (tree->left) ||  eval (tree->right);
    default:
      return kw->pat[tree->value].count;
    }
}

/* Clear the match state of every keyword: distances, position, count
   and, when context lines are being saved, the saved lines.  Line
   buffers exist only under the condition arrayfill uses to allocate
   them, so they are touched only in that case.  */
static void
reset_patterns (void)
{
  register int i, j;
  struct pat *pat;

  for (i = 0; i < kw->num; i++)
    {
      pat = &kw->pat[i];
      for (j = 0; j < kw->num; j++)
        pat->distance[j] = 0;
      pat->position = 0;
      pat->count = 0;

      if (!opt.out_quiet && opt.occurences)
        for (j = 0; j < opt.occurences; j++)
          pat->line[j][0] = '\0';
    }
}

/* Report the results collected for the current input and reset the
   keyword state for the next one.

   If the search expression is not satisfied, only the file name is
   printed, and only when opt.list_files is 1.  Otherwise, in quiet
   mode either the count of every keyword (opt.count_matches) or the
   file name (opt.list_files equal to 2) is printed; in normal mode
   the context lines saved by kw_exec are printed when
   opt.occurences is set.

   The match state is cleared on every path.  Leaving it set on some
   of them would let counts, positions and saved lines of one input
   be taken for matches in the next.

   Returns 1 if the expression is satisfied, 0 otherwise.  */
int
kw_print (void)
{
  register int i, j;
  struct pat *pat;
  int matched;

  matched = eval (kw->tree) ? 1 : 0;

  if (!matched)
    {
      if (opt.list_files == 1)
        printf ("%s\n", opt.filename);
    }
  else if (opt.out_quiet)
    {
      if (opt.count_matches)
        for (i = 0; i < kw->num; i++)
          {
            pat = &kw->pat[i];
            if (opt.with_filename)
              printf ("%s:", opt.filename);
            if (opt.with_keyword)
              printf ("%.*s:", pat->len, pat->str);
            printf ("%d\n", pat->count);
          }
      else if (opt.list_files == 2)
        printf ("%s\n", opt.filename);
    }
  else if (opt.occurences)
    {
      for (i = 0; i < kw->num; i++)
        {
          pat = &kw->pat[i];
          if (!pat->count)
            continue;

          /* An empty buffer marks the end of the saved lines.  */
          for (j = 0; j < opt.occurences && pat->line[j][0]; j++)
            {
              if (opt.with_filename)
                printf ("%s:", opt.filename);
              if (opt.with_keyword)
                printf ("%.*s:", pat->len, pat->str);
              printf ("%s\n", pat->line[j]);
            }
        }
    }

  reset_patterns ();
  return matched;
}

/* Release the keyword module's memory by calling mem_free, which
   takes no argument.  NOTE(review): this presumably frees everything
   obtained through mem_init and mem_alloc at once -- confirm against
   mem.c.  */
void
kw_free (void)
{
  mem_free ();
}

