List of usage examples for org.apache.lucene.analysis.custom.CustomAnalyzer#builder()
public static Builder builder()
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
/**
 * Builds an analyzer that turns text into MinHash signatures for near-duplicate /
 * similarity classification: whitespace tokenization, fixed-size shingling, then
 * locality-sensitive MinHash filtering.
 *
 * @param min         shingle size; used for both minShingleSize and maxShingleSize,
 *                    so only shingles of exactly this many tokens are emitted
 * @param hashCount   number of hash functions the MinHash filter applies
 * @param hashSetSize number of minimum hash values kept per hash function
 * @return the assembled analyzer
 * @throws IOException if a tokenizer/filter factory cannot be loaded
 */
public static Analyzer createMinHashAnalyzer(int min, int hashCount, int hashSetSize) throws IOException {
    // Shingle configuration: suppress unigrams so only full shingles reach MinHash.
    Map<String, String> sffargs = new HashMap<>();
    sffargs.put("minShingleSize", "" + min);
    sffargs.put("maxShingleSize", "" + min);
    sffargs.put("outputUnigrams", "false");
    sffargs.put("outputUnigramsIfNoShingles", "false");
    sffargs.put("tokenSeparator", " ");
    // Declared as Map (not HashMap) to program to the interface, consistent with sffargs.
    Map<String, String> lshffargs = new HashMap<>();
    lshffargs.put("hashCount", "" + hashCount);
    lshffargs.put("hashSetSize", "" + hashSetSize);
    CustomAnalyzer.Builder builder = CustomAnalyzer.builder().withTokenizer(WhitespaceTokenizerFactory.class)
            .addTokenFilter(ShingleFilterFactory.class, sffargs)
            .addTokenFilter(MinHashFilterFactory.class, lshffargs);
    return builder.build();
}
From source file:com.twentyn.patentSearch.DocumentIndexer.java
License:Open Source License
/**
 * Command-line entry point: parses options, optionally raises log verbosity,
 * opens (or creates) a Lucene index at the given path, and indexes the patent
 * corpus found at the input path.
 *
 * Options: -i/--input (required), -x/--index (required), -h/--help, -v/--verbose.
 *
 * @param args command-line arguments, parsed with commons-cli
 * @throws Exception propagated from index opening / corpus reading
 */
public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();
    Options opts = new Options();
    opts.addOption(Option.builder("i").longOpt("input").hasArg().required()
            .desc("Input file or directory to index").build());
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required()
            .desc("Path to index file to generate").build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());
    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        // Bad arguments: print usage and exit; cmdLine stays null but is never reached.
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }
    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }
    if (cmdLine.hasOption("verbose")) {
        // Raise the root logger to DEBUG at runtime (log4j2 programmatic reconfiguration).
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);
        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }
    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));
    Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
    /* The standard analyzer is too aggressive with chemical entities (it strips structural annotations, for one
     * thing), and the whitespace analyzer doesn't do any case normalization or stop word elimination. This custom
     * analyzer appears to treat chemical entities better than the standard analyzer without admitting too much
     * cruft to the index. */
    Analyzer analyzer = CustomAnalyzer.builder().withTokenizer("whitespace").addTokenFilter("lowercase")
            .addTokenFilter("stop").build();
    IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    // Large (1 GiB) RAM buffer to reduce flush frequency during bulk indexing.
    writerConfig.setRAMBufferSizeMB(1 << 10);
    IndexWriter indexWriter = new IndexWriter(indexDir, writerConfig);
    String inputFileOrDir = cmdLine.getOptionValue("input");
    File splitFileOrDir = new File(inputFileOrDir);
    if (!(splitFileOrDir.exists())) {
        LOGGER.error("Unable to find directory at " + inputFileOrDir);
        System.exit(1);
    }
    // The corpus reader pushes documents into the indexer, which owns the writer;
    // commitAndClose() flushes and releases the index at the end.
    DocumentIndexer indexer = new DocumentIndexer(indexWriter);
    PatentCorpusReader corpusReader = new PatentCorpusReader(indexer, splitFileOrDir);
    corpusReader.readPatentCorpus();
    indexer.commitAndClose();
}
From source file:io.anserini.index.IndexClueWeb09b.java
License:Apache License
/** * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter}, * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}. * * @return KStemAnalyzer/*w w w. j a v a 2s . co m*/ * @throws IOException */ public static Analyzer analyzer() throws IOException { return CustomAnalyzer.builder().withTokenizer("classic").addTokenFilter("classic") .addTokenFilter("lowercase").addTokenFilter("kstem").build(); }
From source file:io.anserini.IndexerCW09B.java
License:Apache License
/**
 * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter},
 * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}.
 *
 * @return KStemAnalyzer
 * @throws IOException if an analysis component cannot be loaded
 */
static Analyzer analyzer() throws IOException {
    // Classic tokenization, then classic-filter cleanup, case folding, and KStem stemming.
    CustomAnalyzer.Builder builder = CustomAnalyzer.builder().withTokenizer("classic");
    return builder.addTokenFilter("classic")
            .addTokenFilter("lowercase")
            .addTokenFilter("kstem")
            .build();
}
From source file:it.cnr.ilc.lc.claviusweb.ClaviusSearch.java
/**
 * Runs a wildcard full-text search for {@code term} against the on-disk PlainText
 * index and returns highlighter-derived annotations for every hit.
 *
 * NOTE(review): the index path is hard-coded, including the application version
 * ("clavius-1.0.5") — presumably this must track deployments; verify.
 *
 * @param term search term; used verbatim in a {@link WildcardQuery}, so it may
 *             contain wildcard characters
 * @return annotations collected from all matching documents (empty on error;
 *         I/O and offset errors are logged and swallowed)
 */
private static List<Annotation> fullTextSearch(String term)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    log.info("fullTextSearch (" + term + ")");
    List<Annotation> result = new ArrayList<>();
    try {
        Directory indexDirectory = FSDirectory
                .open(Paths.get("/var/lucene/clavius-1.0.5/indexes/it.cnr.ilc.lc.claviusweb.entity.PlainText"));
        DirectoryReader ireader = DirectoryReader.open(indexDirectory);
        IndexSearcher searcher = new IndexSearcher(ireader);
        // Char filter pads punctuation with spaces so the whitespace tokenizer splits on it.
        Analyzer fullTextAnalyzer = CustomAnalyzer.builder()
                .addCharFilter("patternReplace", "pattern", "([\\-\\(\\)\\[\\],\\.;:])", "replacement", " $1 ")
                .withTokenizer("whitespace").build();
        // Earlier query-parser-based approaches, kept for reference:
        //QueryParser parserTerm = new QueryParser("content", fullTextAnalyzer);
        // AnalyzingQueryParser parser = new AnalyzingQueryParser("content", fullTextAnalyzer);
        // Query query2 = parser.parse(term);
        //
        Query query = new WildcardQuery(new Term("content", term));
        TopDocs hits = searcher.search(query, MAX_SEARCH_HITS);
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        //Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter());
        log.info("hits.totalHits=(" + hits.totalHits + ")");
        for (int i = 0; i < hits.totalHits; i++) {
            int id = hits.scoreDocs[i].doc;
            Document doc = searcher.doc(id);
            String idDoc = doc.get("idDoc");
            //String text = doc.get("content");
            // Re-tokenize the stored content so the highlighter can locate match offsets.
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                    fullTextAnalyzer);
            List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, doc, false,
                    10);//highlighter.getBestFragments(tokenStream, text, 3, "...");
            for (int j = 0; j < frag.size(); j++) {
                log.debug("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString());
            }
            result.addAll(frag);
        }
    } catch (InvalidTokenOffsetsException | IOException e) {
        // Best-effort: log and return whatever was collected so far.
        log.error(e);
    }
    log.info("Full Text Search found " + result.size() + " result(s) for term " + term);
    return result;
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
/**
 * Creates the workspace searcher: validates/creates the index directory, builds
 * a whitespace+lowercase analyzer, and opens the Lucene writer and searcher
 * manager over it.
 *
 * @param excludePatterns path matchers for files excluded from indexing
 * @param indexDirectory  directory holding the on-disk Lucene index; created if absent
 * @param root            workspace storage root that indexed paths are relative to
 * @param pathTransformer translates between virtual and filesystem paths
 * @throws IOException if the index directory is misconfigured (a regular file)
 *                     or cannot be created/opened
 */
@Inject
public LuceneSearcher(
        @Named("vfs.index_filter_matcher") Set<PathMatcher> excludePatterns,
        @Named("vfs.local.fs_index_root_dir") File indexDirectory,
        @Named("che.user.workspaces.storage") File root,
        PathTransformer pathTransformer)
        throws IOException {
    if (indexDirectory.exists()) {
        // A regular file at the configured location is a configuration error.
        if (indexDirectory.isFile()) {
            throw new IOException("Wrong configuration `vfs.local.fs_index_root_dir` is a file");
        }
    } else {
        Files.createDirectories(indexDirectory.toPath());
    }
    this.indexDirectory = indexDirectory;
    this.root = root;
    this.excludePatterns = excludePatterns;
    this.pathTransformer = pathTransformer;
    this.analyzer = CustomAnalyzer.builder().withTokenizer(WhitespaceTokenizerFactory.class)
            .addTokenFilter(LowerCaseFilterFactory.class).build();
    // Single-instance lock: only this JVM is expected to write to the index.
    this.luceneIndexWriter = new IndexWriter(
            FSDirectory.open(indexDirectory.toPath(), new SingleInstanceLockFactory()),
            new IndexWriterConfig(analyzer));
    this.searcherManager = new SearcherManager(luceneIndexWriter, true, true, new SearcherFactory());
    // Results ordered by relevance score, then by path for a stable ordering.
    this.sort = new Sort(SortField.FIELD_SCORE, new SortField(PATH_FIELD, SortField.Type.STRING));
}
From source file:org.quelea.services.lucene.BibleSearchIndex.java
License:Open Source License
/**
 * Create a new empty bible search index backed by a temporary memory-mapped
 * directory, using a standard/lowercase/ASCII-folding analyzer.
 *
 * @throws RuntimeException if the temporary index directory cannot be created
 */
public BibleSearchIndex() {
    chapters = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder().withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class).addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-bible").toAbsolutePath());
    } catch (IOException ex) {
        // Fixed copy/paste from SongSearchIndex: this is the *bible* index.
        // Also log the underlying exception instead of dropping it.
        LOGGER.log(Level.SEVERE, "Couldn't create bible search index", ex);
        throw new RuntimeException("Couldn't create bible search index", ex);
    }
}
From source file:org.quelea.services.lucene.SongSearchIndex.java
License:Open Source License
/**
 * Create a new empty song search index. Tokens are standard-tokenized,
 * lower-cased and ASCII-folded; the index lives in a temporary
 * memory-mapped directory.
 */
public SongSearchIndex() {
    songs = new HashMap<>();
    try {
        CustomAnalyzer.Builder analyzerBuilder = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class);
        analyzer = analyzerBuilder.build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-song").toAbsolutePath());
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create song search index");
        throw new RuntimeException("Couldn't create song search index", ex);
    }
}
From source file:org.zephyrsoft.sdb2.service.IndexerServiceImpl.java
License:Open Source License
/**
 * (Re)builds the search index for the given songs on a background executor
 * thread: tokenizes each song into 1..25-character n-grams for substring-style
 * matching, then publishes the finished directory via putIndex.
 *
 * @param indexType which index slot to (re)populate
 * @param songs     songs to index; each is also registered in songByUuid
 */
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
    executor.execute(new Runnable() {
        @Override
        public void run() {
            Stopwatch stopwatch = Stopwatch.createStarted();
            Directory directory = new RAMDirectory();
            try {
                LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
                LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
                // N-gram filter (1..25 chars) enables partial-word matching at query time.
                Analyzer analyzer = CustomAnalyzer.builder().withTokenizer("standard")
                        .addTokenFilter("lowercase")
                        .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25").build();
                IndexWriterConfig config = new IndexWriterConfig(analyzer);
                try (IndexWriter writer = new IndexWriter(directory, config)) {
                    for (Song song : songs) {
                        Document document = createDocument(song);
                        writer.addDocument(document);
                        songByUuid.put(song.getUUID(), song);
                    }
                } catch (IOException e) {
                    // Best-effort: a failed write leaves a partial index but does not crash the app.
                    LOG.warn("couldn't index songs", e);
                }
            } catch (IOException e1) {
                LOG.warn("couldn't create analyzer", e1);
            } finally {
                // The directory is published even on failure so lookups never block on a missing index.
                putIndex(indexType, directory);
                stopwatch.stop();
                LOG.info("indexing songs in background thread took {}", stopwatch.toString());
            }
        }
    });
}
From source file:tw.com.kyle.luminance.AnnotAnalyzerFactory.java
public static Analyzer Get(AnnotAnalyzerEnum an_enum) { Analyzer ana = null;/*from www .ja va 2s .c om*/ try { CustomAnalyzer.Builder builder = CustomAnalyzer.builder(); switch (an_enum) { case RangeAnnotAnalyzer: Map<String, String> params = new HashMap<>(); Map<String, String> fparams = new HashMap<>(); params.put("pattern", "\\((\\d+,\\d+,[^)]*)\\)"); params.put("group", "1"); ana = builder.withTokenizer(OffsetTokenizerFactory.class, params) .addTokenFilter(OffsetTokenFilterFactory.class, fparams).build(); break; case TokenAnnotAnalyzer: //! TokenAnnotAnalyzer at best is a convenient interface of //! RangeAnnotAnalyzer. That is, Token is just a lenth-1-range //! There is no need to specifically write another anlayzer for that. throw new UnsupportedOperationException("Not implemented"); default: ana = new StandardAnalyzer(); } return ana; } catch (IOException ex) { Logger.getLogger(AnnotAnalyzerFactory.class.getName()).log(Level.SEVERE, null, ex); } if (ana == null) { ana = new StandardAnalyzer(); } return ana; }