/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
package fr.gouv.culture.sdx.search.lucene.analysis;

import java.io.Reader;
import java.util.Set;

import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import fr.gouv.culture.sdx.exception.SDXException;
import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LaxistLowerCaseTokenizer;

/**
 * A default Lucene analyzer used by SDX.
 *
 * <p>
 * Text is tokenized with a {@link LaxistLowerCaseTokenizer}; when a stop word
 * set is available, the tokens are additionally passed through a Lucene
 * {@link StopFilter}.
 */
public class DefaultAnalyzer extends AbstractAnalyzer {

    /** The element enclosing the list of stop words. */
    private static final String STOP_WORDS_ELEMENT = "stopWords";

    /** The element containing a stop word. */
    private static final String STOP_WORD_ELEMENT = "stopWord";

    /** The attribute indicating the use of stop words or not. */
    protected static final String ATTRIBUTE_USE_STOP_WORDS = "useStopWords";

    /** The attribute indicating the use of exclusion stem words or not. */
    protected static final String ATTRIBUTE_EXCLUDE_STEMS = "excludeStems";

    /** The type name reported by {@link #getAnalyzerType()}. */
    protected static final String ANALYZER_TYPE = "DefaultAnalyzer";

    /**
     * The set of stop words used, or <code>null</code> when no stop word
     * filtering is applied. Since the Lucene 2.1.0 upgrade this is a
     * {@link Set} (it used to be a Hashtable).
     */
    protected Set stopTable = null;

    /**
     * The set of stemming exclusions, or <code>null</code> when there is none.
     * Since the Lucene 2.1.0 upgrade this is a {@link Set} (it used to be a
     * Hashtable).
     */
    protected Set excludeTable = null;

    /** Name of the element enclosing the stem exclusions in the analyzer config file. */
    protected final String EXCLUDE_STEMS_ELEMENT = "excludeStems";

    /** Name of the element containing one stem exclusion in the analyzer config file. */
    protected final String EXCLUDE_STEM_ELEMENT = "excludeStem";

    /** An array containing some common English words that are not usually useful for searching. */
    public static final String[] DEFAULT_STOP_WORDS = {
        "a", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Builds a default analyzer.
     *
     * <p>
     * No stop word set is installed by the constructor; until
     * {@link #configure(Configuration)} is called, token streams are produced
     * without stop word filtering.
     */
    public DefaultAnalyzer() {
    }

    /**
     * Returns the type name of this analyzer.
     *
     * @return the value of {@link #ANALYZER_TYPE}
     * @see fr.gouv.culture.sdx.search.lucene.analysis.AbstractAnalyzer
     */
    protected String getAnalyzerType() {
        return DefaultAnalyzer.ANALYZER_TYPE;
    }

    /**
     * Configures this analyzer.
     *
     * <p>
     * The class will search for &lt;stopWord&gt; elements and
     * use them as a stop word list. If none is found or the configuration
     * object is null, the default list will be used.
     * <p>
     * If the top-level configuration element has a false
     * value for its useStopWords attribute, no stop words will
     * be used. Likewise, a false value for its excludeStems attribute
     * disables the stem exclusion set. Both attributes default to true.
     *
     * @param configuration the analyzer configuration, may be <code>null</code>
     * @throws ConfigurationException if a configured word cannot be read, or
     *         if building the stop word set fails with an {@link SDXException}
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        if (configuration == null) {
            // No configuration, so we will use default stop words.
            stopTable = StopFilter.makeStopSet(getDefaultStopWords());
            // TODO same thing for the excludeTable: it is left untouched here.
            return;
        }

        if (configuration.getAttributeAsBoolean(ATTRIBUTE_USE_STOP_WORDS, true)) {
            try {
                stopTable = buildStopTable(configuration);
            } catch (SDXException e) {
                // Chain the exception itself: calling fillInStackTrace() on it
                // would overwrite its original stack trace with this frame.
                throw new ConfigurationException(e.getMessage(), e);
            }
        } else {
            stopTable = null;
        }

        if (configuration.getAttributeAsBoolean(ATTRIBUTE_EXCLUDE_STEMS, true)) {
            // Build the set of stemming exclusions from the configuration.
            excludeTable = buildExcludeTable(configuration);
        } else {
            excludeTable = null;
        }
    }

    /**
     * Tokenizes with a {@link LaxistLowerCaseTokenizer} and, when a stop word
     * set is available, filters the result with a {@link StopFilter}.
     *
     * @param fieldName the name of the field being analyzed (not used here)
     * @param reader    the text to tokenize
     * @return the resulting token stream
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream tokens = new LaxistLowerCaseTokenizer(reader);
        if (stopTable == null) {
            return tokens;
        }
        return new StopFilter(tokens, stopTable);
    }

    /**
     * Builds a stop word set from a configuration.
     *
     * <p>
     * The words are the values of the &lt;stopWord&gt; children of the
     * &lt;stopWords&gt; element. When the configuration is null or has no
     * &lt;stopWords&gt; element, the default stop words are used instead.
     *
     * @param   conf  The configuration to use.
     * @return the stop word set, never <code>null</code>
     * @throws SDXException declared for overriding implementations
     * @throws ConfigurationException if the value of a &lt;stopWord&gt; element cannot be read
     */
    protected Set buildStopTable(Configuration conf) throws SDXException, ConfigurationException {
        Set stops = readWordSet(conf, STOP_WORDS_ELEMENT, STOP_WORD_ELEMENT);
        if (stops == null) {
            stops = StopFilter.makeStopSet(getDefaultStopWords());
        }
        return stops;
    }

    /**
     * Returns a default list of stop words.
     *
     * @return {@link #DEFAULT_STOP_WORDS}
     */
    protected String[] getDefaultStopWords() {
        return DEFAULT_STOP_WORDS;
    }

    /**
     * Builds the set of stemming exclusions from a configuration.
     *
     * <p>
     * The words are the values of the &lt;excludeStem&gt; children of the
     * &lt;excludeStems&gt; element. When the configuration is null or has no
     * &lt;excludeStems&gt; element, the default <em>stop words</em> are used.
     * NOTE(review): falling back to the stop word list for stem exclusions
     * looks like a leftover from copying {@link #buildStopTable(Configuration)};
     * it is kept as is to preserve behavior -- confirm whether it is intended.
     *
     * @param   conf  The configuration to use.
     * @return the exclusion set, never <code>null</code>
     * @throws ConfigurationException if the value of an &lt;excludeStem&gt; element cannot be read
     */
    protected Set buildExcludeTable(Configuration conf) throws ConfigurationException {
        Set excludes = readWordSet(conf, EXCLUDE_STEMS_ELEMENT, EXCLUDE_STEM_ELEMENT);
        if (excludes == null) {
            excludes = StopFilter.makeStopSet(getDefaultStopWords());
        }
        return excludes;
    }

    /**
     * Reads a word list from a configuration.
     *
     * @param conf        the configuration to read, may be <code>null</code>
     * @param listElement name of the child element enclosing the word elements
     * @param wordElement name of the elements whose values are the words
     * @return a set built from the word values (possibly empty), or
     *         <code>null</code> when <code>conf</code> is null, has no
     *         <code>listElement</code> child, or yields no children array
     * @throws ConfigurationException if the value of a word element cannot be read
     */
    private Set readWordSet(Configuration conf, String listElement, String wordElement)
            throws ConfigurationException {
        if (conf == null) {
            return null;
        }
        // Passing false asks for null instead of an empty placeholder element.
        Configuration list = conf.getChild(listElement, false);
        if (list == null) {
            return null;
        }
        Configuration[] entries = list.getChildren(wordElement);
        if (entries == null) {
            return null;
        }
        String[] words = new String[entries.length];
        for (int i = 0; i < entries.length; i++) {
            words[i] = entries[i].getValue();
        }
        return StopFilter.makeStopSet(words);
    }

    /** Creates a TokenStream which tokenizes all the text in the provided
     *  Reader. Provided for backward compatibility only; it delegates to
     *  {@link #tokenStream(String, Reader)} with a null field name.
     * @deprecated use tokenStream(String, Reader) instead.
     * @see fr.gouv.culture.sdx.search.lucene.analysis.Analyzer#tokenStream(java.io.Reader)
     * @author Malo Pichot, 2007
     */
    public TokenStream tokenStream(Reader reader) {
        return tokenStream(null, reader);
    }

}
