/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * Pan - A Newsreader for Gtk+
 * Copyright (C) 2002  Charles Kerr <charles@rebelbase.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <config.h>

#include <ctype.h>
#include <string.h>

#include <glib.h>

#include <pan/base/debug.h>
#include <pan/base/log.h>
#include <pan/base/pan-glib-extensions.h>
#include <pan/base/pan-i18n.h>

#include <pan/filters/filter-phrase.h>

/* Class identifier string that filter_phrase_constructor() hands to
 * filter_constructor() for every FilterPhrase it initializes. */
const char * FILTER_PHRASE_CLASS_ID = "PanObject::Filter::FilterPhrase";

/************
*************  
************/

/**
 * Lookup table indexed by byte value: nonzero for the regular-expression
 * metacharacters
 *
 *     .  ^  $  *  +  ?  {  [  |  (  )  and the backslash
 *
 * and zero for every other byte.
 *
 * The table has UCHAR_MAX+1 entries so that every possible guchar value,
 * including 255, is a valid index for is_metacharacter().
 */
#define is_metacharacter(A) (_metacharacters[(guchar)(A)])
static const char _metacharacters[UCHAR_MAX + 1] = {
	/* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x20 */ 0,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0,
	/* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
	/* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,
	/* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,
	/* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xa0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xb0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xc0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xd0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xe0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xf0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};


/************
*************  PROTECTED
************/

/**
 * Duplicate a FilterPhrase: a new phrase filter is created, the base-class
 * fields are copied with filter_class_dup(), and the phrase settings are
 * re-applied from the original's public match type, key type, public key
 * and case-sensitivity flag.
 */
static Filter *
filter_phrase_dup (const Filter * f_old)
{
	Filter * copy;
	const FilterPhrase * src;
	FilterPhrase * dst;

	copy = filter_phrase_new ();
	src = FILTER_PHRASE(f_old);
	dst = FILTER_PHRASE(copy);

	filter_class_dup (f_old, copy);
	filter_phrase_set (dst,
	                   src->public_match_type,
	                   src->key_type,
	                   src->public_key,
	                   src->case_sensitive);

	return copy;
}

static char*
filter_phrase_to_string (const Filter * filter)
{
	const char * key_str = "";
	const char * match_str = "";
	const gboolean negate = filter->negate;
	const FilterPhrase * phrase = FILTER_PHRASE(filter);

	switch (phrase->key_type) {
		case PHRASE_KEY_SUBJECT:     key_str = _("Subject"); break;
		case PHRASE_KEY_AUTHOR:      key_str = _("Author"); break;
		case PHRASE_KEY_MESSAGE_ID:  key_str = _("Message-ID"); break;
		case PHRASE_KEY_REFERENCES:  key_str = _("References"); break;
		case PHRASE_KEY_XREF:        key_str = _("Xref"); break;
		default:                     g_warning ("Unrecognized key_type %d", phrase->key_type);
	}

	if (!negate) {
		switch (phrase->public_match_type) {
			case PHRASE_MATCH_CONTAINS:     match_str = _("contains"); break;
			case PHRASE_MATCH_IS:           match_str = _("is"); break;
			case PHRASE_MATCH_STARTS_WITH:  match_str = _("starts with"); break;
			case PHRASE_MATCH_ENDS_WITH:    match_str = _("ends with"); break;
			case PHRASE_MATCH_REGEX:        match_str = _("matches regular expression"); break;
		        default:                        g_warning ("Unrecognized match_type %d", phrase->public_match_type);
		}
	} else {
		switch (phrase->public_match_type) {
			case PHRASE_MATCH_CONTAINS:     match_str = _("does not contain"); break;
			case PHRASE_MATCH_IS:           match_str = _("is not"); break;
			case PHRASE_MATCH_STARTS_WITH:  match_str = _("does not start with"); break;
			case PHRASE_MATCH_ENDS_WITH:    match_str = _("does not end with"); break;
			case PHRASE_MATCH_REGEX:        match_str = _("does not match regular expression"); break;
		        default:                        g_warning ("Unrecognized match_type %d", phrase->public_match_type);
		}
	}

	return g_strdup_printf ("%s %s \"%s\"", key_str, match_str, phrase->public_key);
}

/**
 * Compile `key' into `preg' as an extended regular expression, adding
 * REG_ICASE when the match is not case sensitive.
 *
 * On failure the regerror() text is written to the log at LOG_ERROR level.
 *
 * @return TRUE if regcomp() succeeded, FALSE otherwise.
 */
static gboolean
my_regcomp (regex_t * preg, const char * key, gboolean case_sensitive)
{
	char errbuf[2048];
	const int cflags = case_sensitive ? REG_EXTENDED : (REG_EXTENDED | REG_ICASE);
	const int status = regcomp (preg, key, cflags);

	if (status == 0)
		return TRUE;

	regerror (status, preg, errbuf, sizeof(errbuf));
	log_add_va (LOG_ERROR, _("Can't use regular expression \"%s\": %s"), key, errbuf);
	return FALSE;
}


/**
 * Run the filter's regular expression against `text_str', compiling it
 * first if the filter is still in the REGEX_NEED_COMPILE state.  The
 * outcome of that compile is remembered in filter->regex_state.
 *
 * @return the regexec() result when a compiled regex is available,
 *         REG_NOMATCH otherwise.
 */
static int
my_regexec (FilterPhrase * filter, const char * text_str)
{
	if (filter->regex_state == REGEX_NEED_COMPILE)
	{
		const gboolean compiled_ok = my_regcomp (&filter->key_regex,
		                                         filter->private_key,
		                                         filter->case_sensitive);
		filter->regex_state = compiled_ok ? REGEX_COMPILED : REGEX_ERR;
	}

	if (filter->regex_state != REGEX_COMPILED)
		return REG_NOMATCH;

	return regexec (&filter->key_regex, text_str, 0, NULL, 0);
}

/**
 * Case-insensitive Boyer-Moore-Horspool-Sunday search.
 *
 * @param text      text to search; text[text_len] is read as the look-ahead
 *                  byte of the last window, so it must be readable
 *                  (e.g. the terminating NUL of a C string)
 * @param text_len  number of bytes in text
 * @param pat       pattern, which must already be lowercased
 * @param pat_len   number of bytes in pat
 * @param skip      shift table built from the lowercased pattern, indexed
 *                  by the (lowercased) byte that follows the current window
 * @return position of the first match, or -1 if no match
 *         (including when the pattern is empty or longer than the text).
 */
static int
bmhs_isearch (const unsigned char * text, int text_len,
              const unsigned char * pat, int pat_len,
              const char * skip)

{
	unsigned char first_uc;
	unsigned char first_lc;
	const unsigned char * t;
	const unsigned char * text_end;
	const unsigned char * pat_end;
	const unsigned char * p;
	const unsigned char * q;

	/* nothing can match, and text_end below would otherwise be computed
	 * as a pointer in front of the text buffer */
	if (pat_len < 1 || text_len < pat_len)
		return -1;

	first_uc = toupper (*pat);
	first_lc = tolower (*pat);
	t = text;
	text_end = text + text_len - pat_len + 1;
	pat_end = pat + pat_len;

	for (;;)
	{
		/* scan loop that searches for the first character of the pattern.
		 * skip[] is plain char, so read it as unsigned to keep large
		 * shifts from going negative where char is signed. */
		while (t<text_end && *t!=first_uc && *t!=first_lc)
			t += (unsigned char) skip[tolower(t[pat_len])];
		if (t >= text_end)
			break;

		/* first character matches, so execute match loop in fwd direction */
		p = pat;
		q = t;
		while (++p < pat_end && *p == tolower(*++q))
			;

		if (p == pat_end)
			return t - text;

		/* the table was built from the lowercased pattern, so the
		 * look-ahead byte must be lowercased here as well; otherwise an
		 * uppercase byte gets the default (maximum) shift and a match
		 * that starts inside the skipped span is missed */
		t += (unsigned char) skip[tolower(t[pat_len])];
	}

	return -1;
}

/**
 * Case-sensitive Boyer-Moore-Horspool-Sunday search.
 *
 * @param text      text to search; text[text_len] is read as the look-ahead
 *                  byte of the last window, so it must be readable
 *                  (e.g. the terminating NUL of a C string)
 * @param text_len  number of bytes in text
 * @param pat       pattern to look for
 * @param pat_len   number of bytes in pat
 * @param skip      shift table built from the pattern, indexed by the byte
 *                  that follows the current window
 * @return position of the first match, or -1 if no match
 *         (including when the pattern is empty or longer than the text).
 */
static int
bmhs_search (const unsigned char * text, int text_len,
             const unsigned char * pat, int pat_len,
             const char * skip)

{
	unsigned char first;
	const unsigned char * t;
	const unsigned char * text_end;
	const unsigned char * pat_end;
	const unsigned char * p;
	const unsigned char * q;

	/* nothing can match, and text_end below would otherwise be computed
	 * as a pointer in front of the text buffer */
	if (pat_len < 1 || text_len < pat_len)
		return -1;

	first = *pat;
	t = text;
	text_end = text + text_len - pat_len + 1;
	pat_end = pat + pat_len;

	for (;;)
	{
		/* scan loop that searches for the first character of the pattern.
		 * skip[] is plain char, so read it as unsigned to keep large
		 * shifts from going negative where char is signed. */
		while (t<text_end && *t!=first)
			t += (unsigned char) skip[t[pat_len]];
		if (t >= text_end)
			break;

		/* first character matches, so execute match loop in fwd direction */
		p = pat;
		q = t;
		while (++p < pat_end && *p == *++q)
			;

		if (p == pat_end)
			return t - text;

		t += (unsigned char) skip[t[pat_len]];
	}

	return -1;
}

/**
 * Test each article against the phrase filter.
 *
 * For every article the header selected by phrase->key_type is fetched and
 * compared against phrase->private_key using the strategy selected by
 * phrase->match_type.  The result is written to does_match[i]; this
 * function does not apply the filter's negate flag.
 *
 * @param filter       the FilterPhrase to apply
 * @param articles     array of article_qty articles
 * @param article_qty  number of entries in articles and does_match
 * @param does_match   output array; entry i is TRUE if article i matches
 */
static void
filter_phrase_test_articles (Filter          * filter,
                             const Article  ** articles,
                             int               article_qty,
                             gboolean        * does_match)
{
	int i;
	FilterPhrase * phrase = FILTER_PHRASE(filter);

	for (i=0; i<article_qty; ++i)
	{
		char buf[512];
		const Article * a = articles[i];
		const char * text = NULL;
		int text_len = 0;

		/* tweak: some newsreaders trim out the leading message-ids from long references headers,
		 * which breaks threading and References filtering.  The common use for looking at References
		 * is for identifying threads, i.e., watch/ignore thread, so for the special case of
		 * "References STARTS_WITH" we'll walk up Pan's own threading before applying the test. */
		if (phrase->key_type==PHRASE_KEY_REFERENCES && phrase->match_type == PHRASE_MATCH_STARTS_WITH)
			while (a->parent!=NULL && pstring_is_set(&a->parent->references))
				a = a->parent;

		switch (phrase->key_type)
		{
			case PHRASE_KEY_SUBJECT:
				text = a->subject.str;
				text_len = a->subject.len;
				break;

			case PHRASE_KEY_AUTHOR:
				text = article_get_author_str (a, buf, sizeof(buf));
				/* don't hand a NULL to strlen(); a NULL text is
				 * reported as "no match" just below */
				text_len = text!=NULL ? (int) strlen (text) : 0;
				break;

			case PHRASE_KEY_MESSAGE_ID:
				text = a->message_id.str;
				text_len = a->message_id.len;
				break;

			case PHRASE_KEY_REFERENCES:
				text = a->references.str;
				text_len = a->references.len;
				break;

			case PHRASE_KEY_XREF:
				text = a->xref.str;
				text_len = a->xref.len;
				break;

			default:
				/* unrecognized key type: nothing to test against */
				text = NULL;
				text_len = 0;
				break;
		}

		if (!is_nonempty_string (text))
		{
			does_match[i] = FALSE;
		}
		else
		{
			const char * pat = phrase->private_key;
			const int pat_len = phrase->private_key_len;

			switch (phrase->match_type)
			{
				case PHRASE_MATCH_REGEX:
					/* my_regexec() returns 0 on a match */
					does_match[i] = !my_regexec (phrase, text);
					break;

				case PHRASE_MATCH_ENDS_WITH:
					if (text_len < pat_len) {
						does_match[i] = FALSE;
						break;
					}
					/* narrow the text to its last pat_len bytes... */
					text += text_len - pat_len;
					text_len = pat_len;
					/* ...and fall through to "is" */

				case PHRASE_MATCH_IS:
					if (phrase->case_sensitive)
						does_match[i] = !pan_strcmp (pat, text);
					else
						does_match[i] = !g_ascii_strcasecmp (pat, text);
					break;

				case PHRASE_MATCH_STARTS_WITH:
					if (phrase->case_sensitive)
						does_match[i] = !strncmp (text, pat, pat_len);
					else
						does_match[i] = !g_ascii_strncasecmp (text, pat, pat_len);
					break;

				case PHRASE_MATCH_CONTAINS:
					if (phrase->case_sensitive)
						does_match[i] = bmhs_search ((const guchar*)text, text_len, (const guchar*)pat, pat_len, phrase->bmhs_skip) != -1;
					else
						does_match[i] = bmhs_isearch ((const guchar*)text, text_len, (const guchar*)pat, pat_len, phrase->bmhs_skip) != -1;
					break;

				default:
					/* unrecognized match type: never leave the
					 * caller's slot unset */
					does_match[i] = FALSE;
					break;
			}
		}
	}
}

/**
 * PanObject destructor for FilterPhrase.
 *
 * Resetting the phrase with a NULL key makes filter_phrase_set() release
 * the key strings and any compiled regex; the base-class destructor is
 * then invoked.
 */
static void
filter_phrase_destructor (PanObject * o)
{
	FilterPhrase * self = FILTER_PHRASE(o);

	filter_phrase_set (self, PHRASE_MATCH_IS, PHRASE_KEY_SUBJECT, NULL, FALSE);

	filter_destructor (o);
}

/**
 * Initialize a FilterPhrase in place: run the base Filter constructor with
 * this class's destructor, test, to-string and dup callbacks, then give the
 * phrase-specific fields their defaults (an "is" match on the Subject
 * header, no key, case-insensitive, regex not yet compiled).
 */
static void
filter_phrase_constructor (FilterPhrase * f)
{
	/* the debug tags name this function; they previously read
	 * "filter_phase_constructor" */
	debug_enter ("filter_phrase_constructor");

	filter_constructor ((Filter*)f,
	                    filter_phrase_destructor,
	                    filter_phrase_test_articles,
	                    filter_phrase_to_string,
	                    filter_phrase_dup,
	                    FILTER_PHRASE_CLASS_ID);

	f->public_match_type = PHRASE_MATCH_IS;
	f->match_type = PHRASE_MATCH_IS;
	f->key_type = PHRASE_KEY_SUBJECT;
	f->private_key_len = 0;
	f->private_key = NULL;
	f->public_key = NULL;
	f->case_sensitive = FALSE;
	f->regex_state = REGEX_NEED_COMPILE;

	debug_exit ("filter_phrase_constructor");
}

static char*
regexp_unescape (const char * in)
{
	char * retval = g_new (char, strlen(in)+1);
	char * out = retval;
	if (*in == '^')
		++in;
	while (*in) {
		if (in[0]=='\\' && is_metacharacter(in[1]))
			++in;
		*out++ = *in++;
	}
	if ((out-retval>1) && in[-1]=='$' && in[-2]!='\\')
		--out;
	*out = '\0';
	return retval;
}

/**
 * Try to downgrade the more-expensive regexes to a cheaper type.
 *
 * A regex qualifies when its only unescaped metacharacters are a '^' in
 * the first position and/or a '$' in the last position.  Depending on
 * which anchors are present the result is IS (both), STARTS_WITH ('^'
 * only), ENDS_WITH ('$' only) or CONTAINS (neither).  Any other unescaped
 * metacharacter keeps the type at PHRASE_MATCH_REGEX.
 *
 * @param key   the match key; must be a non-empty string
 * @param type  the requested match type; returned unchanged unless it is
 *              PHRASE_MATCH_REGEX
 */
static PhraseMatchType
get_real_match_type (const char * key, PhraseMatchType type)
{
	const char * cursor;
	const char * key_end;
	gboolean anchored_at_start = FALSE;
	gboolean anchored_at_end = FALSE;

	/* sanity clause */
	g_return_val_if_fail (is_nonempty_string(key), type);

	/* only regexes are candidates for downgrading */
	if (type != PHRASE_MATCH_REGEX)
		return type;

	/* walk the key looking for anything that really needs a regex */
	key_end = key + strlen (key);
	cursor = key;
	while (cursor != key_end)
	{
		const char ch = *cursor;

		if (ch=='\\' && is_metacharacter(cursor[1]))
			++cursor; /* escaped metacharacter: step over it as a literal */
		else if (ch=='^' && cursor==key)
			anchored_at_start = TRUE;
		else if (ch=='$' && cursor+1==key_end)
			anchored_at_end = TRUE;
		else if (is_metacharacter(ch))
			return PHRASE_MATCH_REGEX;

		++cursor;
	}

	if (anchored_at_start)
		return anchored_at_end ? PHRASE_MATCH_IS : PHRASE_MATCH_STARTS_WITH;

	return anchored_at_end ? PHRASE_MATCH_ENDS_WITH : PHRASE_MATCH_CONTAINS;
}

/************
*************  PUBLIC
************/

/**
 * Allocate a zero-filled FilterPhrase, run its constructor, and return it
 * as a Filter.
 */
Filter*
filter_phrase_new (void)
{
	FilterPhrase * phrase;

	debug_enter ("filter_phrase_new");

	phrase = g_new0 (FilterPhrase, 1);
	filter_phrase_constructor (phrase);

	debug_exit ("filter_phrase_new");

	return FILTER(phrase);
}

/**
 * Check whether regex_str compiles as an extended regular expression.
 *
 * @return NULL if the expression compiles; otherwise a newly-allocated
 *         copy (g_strdup) of the regerror() message.
 */
char*
filter_phrase_validate_regex   (const char * regex_str)
{
	regex_t compiled;
	char message[2048];
	const int status = regcomp (&compiled, regex_str, REG_EXTENDED);

	if (status == 0) {
		/* valid: the compiled form was only needed for the check */
		regfree (&compiled);
		return NULL;
	}

	regerror (status, &compiled, message, sizeof(message));
	return g_strdup (message);
}

/**
 * (Re)configure a phrase filter.
 *
 * Any previous key strings and compiled regex are released first.  When
 * key is non-NULL:
 *  - public_key keeps the key exactly as given;
 *  - match_type is the cheapest equivalent of the requested type, as
 *    chosen by get_real_match_type();
 *  - private_key is the text actually compared: the key itself, or its
 *    unescaped form when a regex was downgraded to a plain-text match;
 *  - for case-insensitive plain-text matches private_key is lowercased;
 *  - the Boyer-Moore-Horspool-Sunday shift table is rebuilt.
 *
 * @param filter          the filter to change; must not be NULL
 * @param match_type      the match type requested by the caller
 * @param key_type        which header to test
 * @param key             the phrase or regex, or NULL to clear the filter
 * @param case_sensitive  whether matching is case sensitive
 */
void
filter_phrase_set (FilterPhrase        * filter,
                   PhraseMatchType       match_type,
                   PhraseKeyType         key_type,
                   const char          * key,
                   gboolean              case_sensitive)
{
	/* sanity clause -- checked before debug_enter() so that the early
	 * return cannot skip the matching debug_exit() */
	g_return_if_fail (filter!=NULL);

	debug_enter ("filter_phrase_set");

	/* free the previous phrase */
	replace_gstr (&filter->public_key, NULL);
	replace_gstr (&filter->private_key, NULL);
	if (filter->regex_state == REGEX_COMPILED)
		regfree (&filter->key_regex);
	/* reset unconditionally: a REGEX_ERR left over from a previous key
	 * must not keep my_regexec() from compiling the new one */
	filter->regex_state = REGEX_NEED_COMPILE;

	/* repopulate the filter */
	filter->public_match_type = match_type;
	filter->match_type = key==NULL ? match_type : get_real_match_type (key, match_type);
	filter->key_type = key_type;
	filter->case_sensitive = case_sensitive;
	filter->private_key_len = 0;
	if (key != NULL)
	{
		/* build the key strings */
		filter->public_key = g_strdup (key);
		filter->private_key = filter->match_type == filter->public_match_type
			? g_strdup (key)
			: regexp_unescape (key);

		/* Lowercase plain-text keys for the case-insensitive compares.
		 * A real regex is left alone: my_regcomp() already passes
		 * REG_ICASE, and lowercasing would alter the meaning of
		 * case-significant escapes (e.g. \S versus \s, a GNU
		 * extension). */
		if (!filter->case_sensitive && filter->match_type != PHRASE_MATCH_REGEX) {
			char * pch;
			for (pch=filter->private_key; *pch; ++pch)
				*pch = tolower ((guchar)*pch);
		}
		filter->private_key_len = strlen (filter->private_key);

		/* Boyer-Moore-Horspool-Sunday */
		{
			int i;
			char * skip = filter->bmhs_skip;
			const char * pat = filter->private_key;
			const int len = filter->private_key_len;

			/* The table elements are plain char, so shifts are capped
			 * at CHAR_MAX.  A shift smaller than the ideal one only
			 * makes the search take more steps; it can never skip
			 * over a match.
			 *
			 * NOTE(review): this loop stops at UCHAR_MAX-1, so the
			 * entry for byte value 255 is not assigned here.  Whether
			 * bmhs_skip has room for that entry depends on its
			 * declaration, which is not visible in this file --
			 * confirm, and extend both the array and this loop to
			 * UCHAR_MAX+1 entries if needed. */
			for (i=0; i<UCHAR_MAX; ++i)
				skip[i] = MIN (len + 1, CHAR_MAX);
			for (i=0; i<len; i++)
				skip[(guchar)(pat[i])] = MIN (len - i, CHAR_MAX);
		}
	}

	debug_exit ("filter_phrase_set");
}

/**
***
**/

/**
 * Append `in' to `out', putting a backslash in front of every regular
 * expression metacharacter so the text is matched literally.
 * A NULL `in' appends nothing.
 */
static void
quote_regexp (GString * out, const char * in)
{
	const char * cursor = in;

	if (cursor == NULL)
		return;

	while (*cursor != '\0')
	{
		const char ch = *cursor++;

		if (is_metacharacter (ch))
			g_string_append_c (out, '\\');
		g_string_append_c (out, ch);
	}
}

/**
 * Express a phrase and match type as a single regular expression string.
 *
 * A PHRASE_MATCH_REGEX input is taken as-is; any other input has its
 * metacharacters escaped.  The result is whitespace-stripped and, when not
 * empty, anchored with '^' for "is"/"starts with" and with '$' for
 * "is"/"ends with".
 *
 * @return a newly-allocated string owned by the caller.
 */
char*
filter_phrase_create_regex   (const char           * in,
                              PhraseMatchType        match_type)
{
	const gboolean anchor_start = match_type == PHRASE_MATCH_IS
	                           || match_type == PHRASE_MATCH_STARTS_WITH;
	const gboolean anchor_end = match_type == PHRASE_MATCH_IS
	                         || match_type == PHRASE_MATCH_ENDS_WITH;
	GString * regex = g_string_new (NULL);

	if (match_type != PHRASE_MATCH_REGEX)
		quote_regexp (regex, in);
	else
		g_string_assign (regex, in);

	pan_g_string_strstrip (regex);

	/* an empty expression is left unanchored */
	if (regex->len != 0)
	{
		if (anchor_start)
			g_string_insert_c (regex, 0, '^');

		if (anchor_end)
			g_string_append_c (regex, '$');
	}

	return g_string_free (regex, FALSE);
}
