Example usage for org.apache.lucene.analysis.it ItalianAnalyzer ItalianAnalyzer

List of usage examples for org.apache.lucene.analysis.it ItalianAnalyzer ItalianAnalyzer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.it.ItalianAnalyzer constructor.

Prototype

public ItalianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) 

Source Link

Document

Builds an analyzer with the given stop words and stem exclusion set.

Usage

From source file:com.tilab.ca.sse.core.lucene.IndexesUtil.java

License:Open Source License

/**
 * Sets up the Italian and English corpus index searchers.
 *
 * The configuration is fetched through {@code ConfigCache.getOrCreate(SSEConfig.class)};
 * each searcher is then built inside {@code indexLoading(...)} from the configured
 * corpus index location and stop words. A searcher whose loading yields no value
 * is left as {@code null}.
 *
 * As stated by the original authors, this must be called after an instance of
 * the SSEVariables class has been constructed (see the SSEVariables docs), and
 * skipping it leads to a NullPointerException in Classifier() when the
 * classifier is used.
 *
 * @throws RuntimeException if neither the Italian nor the English searcher
 *         could be created
 * @since 2.0.0.0.
 */
public static void init() {
    LOG.debug("[initializator] - BEGIN");

    sseConfigFromCache = ConfigCache.getOrCreate(SSEConfig.class);

    // Italian corpus searcher
    ITALIAN_CORPUS_INDEX_SEARCHER = indexLoading(() -> {
        File italianIndexLocation = new File(sseConfigFromCache.corpusIndexIT());
        Directory italianIndexDir = LuceneManager.pickDirectory(italianIndexLocation);
        LOG.info("Corpus index used for italian: " + italianIndexDir);
        LuceneManager italianManager = new LuceneManager(italianIndexDir);
        ItalianAnalyzer italianAnalyzer = new ItalianAnalyzer(Version.LUCENE_36,
                getStopWords(sseConfigFromCache.stopWordsIT()));
        italianManager.setLuceneDefaultAnalyzer(italianAnalyzer);
        return new SimpleSearcher(italianManager);
    }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher

    // English corpus searcher
    ENGLISH_CORPUS_INDEX_SEARCHER = indexLoading(() -> {
        File englishIndexLocation = new File(sseConfigFromCache.corpusIndexEN());
        Directory englishIndexDir = LuceneManager.pickDirectory(englishIndexLocation);
        LOG.info("Corpus index used for english: " + englishIndexDir);
        LuceneManager englishManager = new LuceneManager(englishIndexDir);
        EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(Version.LUCENE_36,
                getStopWords(sseConfigFromCache.stopWordsEN()));
        englishManager.setLuceneDefaultAnalyzer(englishAnalyzer);
        return new SimpleSearcher(englishManager);
    }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher

    // Only fail when no index at all is usable; a single missing language is tolerated.
    final boolean italianMissing = ITALIAN_CORPUS_INDEX_SEARCHER == null;
    final boolean englishMissing = ENGLISH_CORPUS_INDEX_SEARCHER == null;
    if (italianMissing && englishMissing) {
        throw new RuntimeException("Indexes not available");
    }

    LOG.debug("[initializator] - END");
}

From source file:it.polito.tellmefirst.lucene.IndexesUtil.java

License:Open Source License

/**
 * Builds the Italian and English corpus index searchers and stores them in
 * {@code ITALIAN_CORPUS_INDEX_SEARCHER} and {@code ENGLISH_CORPUS_INDEX_SEARCHER}.
 *
 * Each searcher wraps a {@code LuceneManager} opened on the corresponding
 * {@code TMFVariables.CORPUS_INDEX_*} location, with a language-specific
 * analyzer configured from {@code TMFVariables.STOPWORDS_*}.
 *
 * @throws TMFIndexesWarmUpException if anything goes wrong while setting up
 *         either index; the original exception is kept as the cause
 */
public IndexesUtil() throws TMFIndexesWarmUpException {
    LOG.debug("[constructor] - BEGIN");
    try {
        // build italian searcher
        Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT));
        LOG.info("Corpus index used for italian: " + contextIndexDirIT);
        LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT);
        contextLuceneManagerIT
                .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT));
        ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT);

        // build english searcher
        Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN));
        LOG.info("Corpus index used for english: " + contextIndexDirEN);
        LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN);
        contextLuceneManagerEN
                .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN));
        // The English searcher must use the English manager; it was previously
        // built from the Italian one, so English queries hit the Italian index.
        ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN);
    } catch (Exception e) {
        // Not handled here on purpose: rethrow wrapped so that a failed warm-up stops the TMF server.
        throw new TMFIndexesWarmUpException("Problem with setting up TMF indexes: ", e);
    }
    LOG.debug("[constructor] - END");
}

From source file:it.polito.tellmefirst.web.rest.TMFServer.java

License:Open Source License

/**
 * TMF starting point. From rest directory, launch this command:
 * mvn exec:java -Dexec.mainClass="it.polito.tellmefirst.web.rest.TMFServer" -Dexec.args="<path_to_TMF_installation>/conf/server.properties"
 * or use the run.sh file in bin directory
 *
 * Loads the configuration named by the first argument, builds the corpus and
 * knowledge-base searchers, the enhancer and the two classifiers, then starts
 * the Grizzly/Jersey REST endpoint on http://localhost:2222/rest/ and blocks
 * until {@code running} becomes false.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the
 *             server configuration file
 * @throws IllegalArgumentException if no configuration file path is supplied
 */
public static void main(String[] args) throws TMFConfigurationException, TMFIndexesWarmUpException,
        URISyntaxException, InterruptedException, IOException {
    LOG.debug("[main] - BEGIN");
    // Fail with a usable message instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        throw new IllegalArgumentException(
                "Missing argument: path to the TMF configuration file (e.g. <path_to_TMF_installation>/conf/server.properties)");
    }
    URI serverURI = new URI("http://localhost:2222/rest/");
    String configFileName = args[0];
    // Constructed for its side effect: the TMFVariables.* values read below.
    new TMFVariables(configFileName);

    // XXX I put the code of IndexUtil.init() here, because, for now, I need a reference of SimpleSearchers for the Enhancer

    // build italian searcher
    Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT));
    LOG.info("Corpus index used for italian: " + contextIndexDirIT);
    LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT);
    contextLuceneManagerIT
            .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT));
    ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT);

    // build english searcher
    Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN));
    LOG.info("Corpus index used for english: " + contextIndexDirEN);
    LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN);
    contextLuceneManagerEN
            .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN));
    ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN);

    // build kb italian searcher
    String kbDirIT = TMFVariables.KB_IT;
    String residualKbDirIT = TMFVariables.RESIDUAL_KB_IT;
    ITALIAN_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirIT, residualKbDirIT);

    // build kb english searcher
    String kbDirEN = TMFVariables.KB_EN;
    String residualKbDirEN = TMFVariables.RESIDUAL_KB_EN;
    ENGLISH_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirEN, residualKbDirEN);

    enhancer = new Enhancer(ITALIAN_CORPUS_INDEX_SEARCHER, ENGLISH_CORPUS_INDEX_SEARCHER,
            ITALIAN_KB_INDEX_SEARCHER, ENGLISH_KB_INDEX_SEARCHER);

    italianClassifier = new Classifier("it", ITALIAN_CORPUS_INDEX_SEARCHER);
    englishClassifier = new Classifier("en", ENGLISH_CORPUS_INDEX_SEARCHER);

    //The following is adapted from DBpedia Spotlight (https://github.com/dbpedia-spotlight/dbpedia-spotlight)
    final Map<String, String> initParams = new HashMap<String, String>();
    initParams.put("com.sun.jersey.config.property.resourceConfigClass",
            "com.sun.jersey.api.core." + "PackagesResourceConfig");
    initParams.put("com.sun.jersey.config.property.packages", "it.polito.tellmefirst.web.rest.services");
    initParams.put("com.sun.jersey.config.property.WadlGeneratorConfig",
            "it.polito.tellmefirst.web.rest.wadl." + "ExternalUriWadlGeneratorConfig");
    SelectorThread threadSelector = GrizzlyWebContainerFactory.create(serverURI, initParams);
    threadSelector.start();
    System.err.println("Server started in " + System.getProperty("user.dir") + " listening on " + serverURI);
    // NOTE(review): this warm-up thread has an empty body and does nothing; kept
    // as a placeholder to preserve existing behavior - confirm whether it can go.
    Thread warmUp = new Thread() {
        public void run() {
        }
    };
    warmUp.start();
    // Keep the JVM alive until the 'running' flag is cleared elsewhere.
    while (running) {
        Thread.sleep(100);
    }
    threadSelector.stopEndpoint();
    // Log before exiting: System.exit does not return, so anything after it never runs.
    LOG.debug("[main] - END");
    System.exit(0);
}

From source file:org.elasticsearch.analysis.common.ItalianAnalyzerProvider.java

License:Apache License

ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ItalianAnalyzer(Analysis.parseStopWords(env, settings, ItalianAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);// w  w  w. ja va  2s  .c  om
}

From source file:org.omegat.tokenizer.LuceneItalianTokenizer.java

License:Open Source License

/**
 * Produces the token stream for the given text.
 *
 * Without stemming, the text is tokenized by a plain {@link StandardTokenizer}.
 * With stemming, an {@link ItalianAnalyzer} is used, configured with its default
 * stop set when stop words are allowed and with an empty set otherwise.
 *
 * @param strOrig          text to tokenize
 * @param stemsAllowed     whether to use the stemming Italian analyzer
 * @param stopWordsAllowed whether the analyzer's default stop set is applied;
 *                         only consulted when {@code stemsAllowed} is true
 * @return token stream over {@code strOrig}
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    if (!stemsAllowed) {
        return new StandardTokenizer(getBehavior(), new StringReader(strOrig));
    }

    final Set<?> stopWords;
    if (stopWordsAllowed) {
        stopWords = ItalianAnalyzer.getDefaultStopSet();
    } else {
        stopWords = Collections.EMPTY_SET;
    }
    final ItalianAnalyzer italianAnalyzer = new ItalianAnalyzer(getBehavior(), stopWords);
    return italianAnalyzer.tokenStream("", new StringReader(strOrig));
}