Java tutorial: FtpFilePathAnalyzer, a custom Lucene analyzer for FTP file paths
package analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

import resource.CrawlerSetting;
import standard.StandardFilter;
import standard.StandardTokenizer;

/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, a stop
 * filter and a {@link SnowballFilter}, after lowercasing and recovery
 * mapping at the character level.
 *
 * @version $Id: FtpFilePathAnalyzer.java,v 1.8 2009-01-09 09:27:24 rl Exp $
 */
public class FtpFilePathAnalyzer extends Analyzer {

  /**
   * Added by Weiwei Wang to recover C++-like input. Built from the
   * "analyzer.ftp.recovery" property, a semicolon-separated list of
   * from/to pairs applied to the character stream before tokenization.
   */
  public static final NormalizeCharMap RECOVERY_MAP;

  static {
    RECOVERY_MAP = new NormalizeCharMap();
    String recoveryWords = CrawlerSetting.getProperty("analyzer.ftp.recovery");
    String[] splits = recoveryWords.split(";");
    // Entries come in pairs: splits[i] is the original form and
    // splits[i + 1] is the form it is normalized to.
    for (int i = 0; i < splits.length; i += 2) {
      RECOVERY_MAP.add(splits[i], splits[i + 1]);
    }
  }
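  // A hypothetical example of the property this block expects (the real
  // values live in the crawler configuration, not in this listing):
  //
  //   analyzer.ftp.recovery=c++;cplusplus;c#;csharp
  //
  // would map "c++" -> "cplusplus" and "c#" -> "csharp" before the
  // tokenizer strips the punctuation.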
  @SuppressWarnings("unchecked")
  protected Set stopSet;

  protected static final String STEMMER = "English";

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * This is false by default to support backward compatibility.
   *
   * @deprecated this should be removed in the next release (3.0). See
   *             https://issues.apache.org/jira/browse/LUCENE-1068
   */
  protected boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;

  private static boolean defaultReplaceInvalidAcronym;

  // Default to true (fixes the bug), unless the system property is set.
  static {
    final String v = System
        .getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
    if (v == null || v.equals("true"))
      defaultReplaceInvalidAcronym = true;
    else
      defaultReplaceInvalidAcronym = false;
  }

  /**
   * @return true if new instances of StandardTokenizer will replace
   *         mischaracterized acronyms. See
   *         https://issues.apache.org/jira/browse/LUCENE-1068
   * @deprecated This will be removed (hardwired to true) in 3.0
   */
  public static boolean getDefaultReplaceInvalidAcronym() {
    return defaultReplaceInvalidAcronym;
  }

  /**
   * @param replaceInvalidAcronym
   *          Set to true to have new instances of StandardTokenizer replace
   *          mischaracterized acronyms by default. Set to false to preserve
   *          the previous (before 2.4) buggy behavior. Alternatively, set the
   *          system property
   *          org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
   *          to false. See https://issues.apache.org/jira/browse/LUCENE-1068
   * @deprecated This will be removed (hardwired to true) in 3.0
   */
  public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
    defaultReplaceInvalidAcronym = replaceInvalidAcronym;
  }

  /**
   * An array containing some common English words that are usually not useful
   * for searching. Combines the "analyzer.ftp.stopwords" property (a
   * semicolon-separated list) with Lucene's built-in English stop words.
   */
  public static final String[] STOP_WORDS;

  static {
    String stopwords = CrawlerSetting.getProperty("analyzer.ftp.stopwords");
    String[] splits = stopwords.split(";");
    String[] STOP = new String[splits.length + StopAnalyzer.ENGLISH_STOP_WORDS_SET.size()];
    // Configured stop words first, then Lucene's default English set.
    for (int i = 0; i < splits.length; i++)
      STOP[i] = splits[i];
    Iterator<?> iterator = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
    for (int i = 0; iterator.hasNext(); i++) {
      STOP[i + splits.length] = (String) iterator.next();
    }
    STOP_WORDS = STOP;
  }
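  // Again hypothetically, a configuration line such as
  //
  //   analyzer.ftp.stopwords=pub;incoming;upload
  //
  // would add those path segments on top of Lucene's English stop words.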
  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
  public FtpFilePathAnalyzer() {
    this(STOP_WORDS);
  }

  /** Builds an analyzer with the given stop words. */
  @SuppressWarnings("unchecked")
  public FtpFilePathAnalyzer(Set stopWords) {
    stopSet = stopWords;
  }

  /** Builds an analyzer with the given stop words. */
  public FtpFilePathAnalyzer(String[] stopWords) {
    stopSet = StopFilter.makeStopSet(stopWords);
  }

  /**
   * Builds an analyzer with the stop words from the given file.
   *
   * @see WordlistLoader#getWordSet(File)
   */
  public FtpFilePathAnalyzer(File stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Builds an analyzer with the stop words from the given reader.
   *
   * @see WordlistLoader#getWordSet(Reader)
   */
  public FtpFilePathAnalyzer(Reader stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Constructs a {@link StandardTokenizer} over a char stream pre-processed
   * by LowercaseCharFilter and {@link MappingCharFilter}, then filtered by a
   * {@link StandardFilter}, a stop filter and a {@link SnowballFilter}.
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // Char filters rewrite the raw character stream before tokenization,
    // so mapped forms like "c++" survive the tokenizer.
    CharFilter filter = new LowercaseCharFilter(reader);
    filter = new MappingCharFilter(RECOVERY_MAP, filter);
    StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, filter);
    tokenStream.setMaxTokenLength(maxTokenLength);
    TokenStream result = new StandardFilter(tokenStream);
    // No LowerCaseFilter here: lowercasing already happened at the
    // character level, before the mapping filter.
    result = getStopFilter(result);
    result = new SnowballFilter(result, STEMMER);
    return result;
  }

  public TokenFilter getStopFilter(TokenStream in) {
    // RosaStopFilter is a project-specific stop filter (same package).
    return new RosaStopFilter(in, stopSet);
  }

  static final class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  protected int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Set the maximum allowed token length. If a token is seen that exceeds
   * this length then it is discarded. This setting only takes effect the next
   * time tokenStream or reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  @SuppressWarnings("deprecation")
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      // First call on this thread: build the whole chain and cache it.
      streams = new SavedStreams();
      setPreviousTokenStream(streams);
      CharFilter filter = new LowercaseCharFilter(reader);
      filter = new MappingCharFilter(RECOVERY_MAP, filter);
      streams.tokenStream = new StandardTokenizer(Version.LUCENE_30, filter);
      streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
      streams.filteredTokenStream = getStopFilter(streams.filteredTokenStream);
      streams.filteredTokenStream = new SnowballFilter(streams.filteredTokenStream, STEMMER);
    } else {
      // Subsequent calls: rebuild only the char filters and reset the
      // cached tokenizer onto the new reader.
      CharFilter filter = new LowercaseCharFilter(reader);
      filter = new MappingCharFilter(RECOVERY_MAP, filter);
      streams.tokenStream.reset(filter);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
    return streams.filteredTokenStream;
  }

  public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);
    try {
      // Print each token with its offsets, position increment and type.
      while (ts.incrementToken()) {
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts
            .getAttribute(PositionIncrementAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
        System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
            + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] ["
            + termAtt.term() + "]");
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
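The listing uses two classes it does not show: LowercaseCharFilter and RosaStopFilter. Neither is part of stock Lucene 3.0, so they must come from the project itself. As a rough guide, here is a minimal sketch of what LowercaseCharFilter might look like, assuming Lucene 3.0's CharFilter/CharReader API; the real project class may differ.

package analysis;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.CharReader;

/**
 * Hypothetical sketch of the project-specific LowercaseCharFilter:
 * lowercases the character stream before it reaches the tokenizer, so
 * RECOVERY_MAP only needs lower-case keys.
 */
public class LowercaseCharFilter extends CharFilter {

  public LowercaseCharFilter(Reader in) {
    // CharReader.get wraps a plain Reader into Lucene's CharStream.
    super(CharReader.get(in));
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int n = input.read(cbuf, off, len);
    // Lowercase in place; when n == -1 (end of stream) the loop is skipped.
    for (int i = off; i < off + n; i++) {
      cbuf[i] = Character.toLowerCase(cbuf[i]);
    }
    return n;
  }
}

A production version would use code points (Character.toLowerCase(int)) to handle supplementary characters correctly; the per-char version above keeps the sketch short, and since it never changes the stream's length, no offset correction is needed.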