List of usage examples for the ItalianAnalyzer constructor of org.apache.lucene.analysis.it.ItalianAnalyzer
public ItalianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet)
From source file:com.tilab.ca.sse.core.lucene.IndexesUtil.java
License:Open Source License
/** * Initialize the classifiers. This static method initializes the italian * and the english classifiers under the hood. You must call this function * after you have constructed an instance of the SSEVariables class as * described in SSEVariables docs.//from w w w.j a va2s . c o m * * If you don't call this method, when you use the classifier you will get a * NullPointerException in Classifier(). * * @since 2.0.0.0. */ public static void init() { LOG.debug("[initializator] - BEGIN"); sseConfigFromCache = ConfigCache.getOrCreate(SSEConfig.class); ITALIAN_CORPUS_INDEX_SEARCHER = indexLoading(() -> { // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexIT())); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT.setLuceneDefaultAnalyzer( new ItalianAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsIT()))); return new SimpleSearcher(contextLuceneManagerIT); }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher ENGLISH_CORPUS_INDEX_SEARCHER = indexLoading(() -> { // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexEN())); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN.setLuceneDefaultAnalyzer( new EnglishAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsEN()))); return new SimpleSearcher(contextLuceneManagerEN); }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher if (ITALIAN_CORPUS_INDEX_SEARCHER == null && ENGLISH_CORPUS_INDEX_SEARCHER == null) { throw new RuntimeException("Indexes not available"); } LOG.debug("[initializator] - END"); }
From source file:it.polito.tellmefirst.lucene.IndexesUtil.java
License:Open Source License
public IndexesUtil() throws TMFIndexesWarmUpException { LOG.debug("[constructor] - BEGIN"); try {//w w w . j a va 2 s. c om // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT)); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT)); ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN)); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN)); ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); } catch (Exception e) { //exceptions are not catched here, because we want to stop TMF server throw new TMFIndexesWarmUpException("Problem with setting up TMF indexes: ", e); } LOG.debug("[constructor] - END"); }
From source file:it.polito.tellmefirst.web.rest.TMFServer.java
License:Open Source License
/** * TMF starting point. From rest directory, launch this command: * mvn exec:java -Dexec.mainClass="it.polito.temefirst.web.rest.TMFServer" -Dexec.args="<path_to_TMF_installation>/conf/server.properties" * or use the run.sh file in bin directory */// w ww.j a v a 2s . c o m public static void main(String[] args) throws TMFConfigurationException, TMFIndexesWarmUpException, URISyntaxException, InterruptedException, IOException { LOG.debug("[main] - BEGIN"); URI serverURI = new URI("http://localhost:2222/rest/"); String configFileName = args[0]; new TMFVariables(configFileName); // XXX I put the code of IndexUtil.init() here, because, for now, I need a reference of SimpleSearchers for the Enhancer // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT)); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT)); ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN)); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN)); ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN); // build kb italian searcher String kbDirIT = TMFVariables.KB_IT; String residualKbDirIT = TMFVariables.RESIDUAL_KB_IT; ITALIAN_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirIT, residualKbDirIT); // build kb english searcher String kbDirEN = TMFVariables.KB_EN; String residualKbDirEN = TMFVariables.RESIDUAL_KB_EN; ENGLISH_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirEN, 
residualKbDirEN); enhancer = new Enhancer(ITALIAN_CORPUS_INDEX_SEARCHER, ENGLISH_CORPUS_INDEX_SEARCHER, ITALIAN_KB_INDEX_SEARCHER, ENGLISH_KB_INDEX_SEARCHER); italianClassifier = new Classifier("it", ITALIAN_CORPUS_INDEX_SEARCHER); englishClassifier = new Classifier("en", ENGLISH_CORPUS_INDEX_SEARCHER); //The following is adapted from DBpedia Spotlight (https://github.com/dbpedia-spotlight/dbpedia-spotlight) final Map<String, String> initParams = new HashMap<String, String>(); initParams.put("com.sun.jersey.config.property.resourceConfigClass", "com.sun.jersey.api.core." + "PackagesResourceConfig"); initParams.put("com.sun.jersey.config.property.packages", "it.polito.tellmefirst.web.rest.services"); initParams.put("com.sun.jersey.config.property.WadlGeneratorConfig", "it.polito.tellmefirst.web.rest.wadl." + "ExternalUriWadlGeneratorConfig"); SelectorThread threadSelector = GrizzlyWebContainerFactory.create(serverURI, initParams); threadSelector.start(); System.err.println("Server started in " + System.getProperty("user.dir") + " listening on " + serverURI); Thread warmUp = new Thread() { public void run() { } }; warmUp.start(); while (running) { Thread.sleep(100); } threadSelector.stopEndpoint(); System.exit(0); LOG.debug("[main] - END"); }
From source file:org.elasticsearch.analysis.common.ItalianAnalyzerProvider.java
License:Apache License
ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new ItalianAnalyzer(Analysis.parseStopWords(env, settings, ItalianAnalyzer.getDefaultStopSet()),
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
analyzer.setVersion(version);// w w w. ja va 2s .c om
}
From source file:org.omegat.tokenizer.LuceneItalianTokenizer.java
License:Open Source License
/**
 * Produces the token stream for the given text.
 *
 * Without stemming, the text is split by a plain StandardTokenizer. With
 * stemming, it goes through an ItalianAnalyzer that is given either its
 * default stop set or an empty set, depending on stopWordsAllowed.
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    // No stemming requested: tokenize only.
    if (!stemsAllowed) {
        return new StandardTokenizer(getBehavior(), new StringReader(strOrig));
    }

    final Set<?> stopSet;
    if (stopWordsAllowed) {
        stopSet = ItalianAnalyzer.getDefaultStopSet();
    } else {
        stopSet = Collections.EMPTY_SET;
    }
    final ItalianAnalyzer italianAnalyzer = new ItalianAnalyzer(getBehavior(), stopSet);
    return italianAnalyzer.tokenStream("", new StringReader(strOrig));
}