Example usage for org.apache.lucene.analysis.custom CustomAnalyzer builder

List of usage examples for org.apache.lucene.analysis.custom CustomAnalyzer builder

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.custom CustomAnalyzer builder.

Prototype

public static Builder builder() 

Source Link

Document

Returns a builder for custom analyzers that loads all resources from Lucene's classloader.

Usage

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

/**
 * Builds a MinHash analyzer: whitespace tokenization, fixed-size shingling
 * (unigrams suppressed), then MinHash fingerprinting of the shingle stream.
 *
 * @param min         shingle size, used as both the minimum and maximum shingle length
 * @param hashCount   number of hash functions used by the MinHash filter
 * @param hashSetSize size of the per-hash minimum set kept by the MinHash filter
 * @return the configured {@link Analyzer}
 * @throws IOException if the custom analyzer cannot be constructed
 */
public static Analyzer createMinHashAnalyzer(int min, int hashCount, int hashSetSize) throws IOException {
    Map<String, String> shingleArgs = new HashMap<>();
    shingleArgs.put("minShingleSize", String.valueOf(min));
    shingleArgs.put("maxShingleSize", String.valueOf(min));
    shingleArgs.put("outputUnigrams", "false");
    shingleArgs.put("outputUnigramsIfNoShingles", "false");
    shingleArgs.put("tokenSeparator", " ");

    Map<String, String> minHashArgs = new HashMap<>();
    minHashArgs.put("hashCount", String.valueOf(hashCount));
    minHashArgs.put("hashSetSize", String.valueOf(hashSetSize));

    return CustomAnalyzer.builder()
            .withTokenizer(WhitespaceTokenizerFactory.class)
            .addTokenFilter(ShingleFilterFactory.class, shingleArgs)
            .addTokenFilter(MinHashFilterFactory.class, minHashArgs)
            .build();
}

From source file:com.twentyn.patentSearch.DocumentIndexer.java

License:Open Source License

/**
 * Entry point: parses the command line, opens (or creates) a Lucene index at
 * the given path, and indexes the patent corpus found at the input path.
 *
 * Options: -i/--input (required), -x/--index (required), -h/--help,
 * -v/--verbose.
 *
 * @param args command-line arguments; see the options above
 * @throws Exception if opening the index or reading the corpus fails
 */
public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();
    Options opts = new Options();
    opts.addOption(Option.builder("i").longOpt("input").hasArg().required()
            .desc("Input file or directory to index").build());
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required()
            .desc("Path to index file to generate").build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    // cmdLine cannot be null past this point: a parse failure exits above.
    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }

    if (cmdLine.hasOption("verbose")) {
        // Raise the root logger to DEBUG at runtime.
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);

        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));
    Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());

    /* The standard analyzer is too aggressive with chemical entities (it strips structural annotations, for one
     * thing), and the whitespace analyzer doesn't do any case normalization or stop word elimination.  This custom
     * analyzer appears to treat chemical entities better than the standard analyzer without admitting too much
     * cruft to the index. */
    Analyzer analyzer = CustomAnalyzer.builder().withTokenizer("whitespace").addTokenFilter("lowercase")
            .addTokenFilter("stop").build();

    IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writerConfig.setRAMBufferSizeMB(1 << 10); // 1 GiB RAM buffer before flushing segments
    IndexWriter indexWriter = new IndexWriter(indexDir, writerConfig);

    String inputFileOrDir = cmdLine.getOptionValue("input");
    File splitFileOrDir = new File(inputFileOrDir);
    if (!(splitFileOrDir.exists())) {
        LOGGER.error("Unable to find directory at " + inputFileOrDir);
        System.exit(1);
    }

    // Stream the corpus through the indexer; commitAndClose() flushes and
    // releases the IndexWriter when the read completes.
    DocumentIndexer indexer = new DocumentIndexer(indexWriter);
    PatentCorpusReader corpusReader = new PatentCorpusReader(indexer, splitFileOrDir);
    corpusReader.readPatentCorpus();
    indexer.commitAndClose();
}

From source file:io.anserini.index.IndexClueWeb09b.java

License:Apache License

/**
 * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter},
 * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}.
 *
 * @return KStemAnalyzer/*w  w w. j  a v  a 2s . co  m*/
 * @throws IOException
 */
public static Analyzer analyzer() throws IOException {
    return CustomAnalyzer.builder().withTokenizer("classic").addTokenFilter("classic")
            .addTokenFilter("lowercase").addTokenFilter("kstem").build();
}

From source file:io.anserini.IndexerCW09B.java

License:Apache License

/**
 * KStemAnalyzer: a {@link ClassicTokenizer} followed by
 * {@link org.apache.lucene.analysis.standard.ClassicFilter},
 * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and
 * {@link org.apache.lucene.analysis.en.KStemFilter}.
 *
 * @return the KStem analyzer described above
 * @throws IOException if the analyzer cannot be built
 */
static Analyzer analyzer() throws IOException {
    return CustomAnalyzer.builder()
            .withTokenizer("classic")
            .addTokenFilter("classic")
            .addTokenFilter("lowercase")
            .addTokenFilter("kstem")
            .build();
}

From source file:it.cnr.ilc.lc.claviusweb.ClaviusSearch.java

/**
 * Runs a wildcard query for {@code term} against the "content" field of the
 * on-disk Lucene index and returns highlighter-derived annotations for every
 * hit.
 *
 * Fix: the original leaked both the {@link Directory} and the
 * {@link DirectoryReader} (neither was ever closed); both are now managed by
 * try-with-resources and closed on every exit path.
 *
 * @param term the search term (may contain wildcard characters)
 * @return annotations collected from all matching documents; empty on error
 * @throws IOException declared for interface compatibility; I/O errors are
 *         currently caught and logged internally
 */
private static List<Annotation> fullTextSearch(String term)
        throws IOException, ParseException, InvalidTokenOffsetsException {

    log.info("fullTextSearch (" + term + ")");
    List<Annotation> result = new ArrayList<>();

    try (Directory indexDirectory = FSDirectory
            .open(Paths.get("/var/lucene/clavius-1.0.5/indexes/it.cnr.ilc.lc.claviusweb.entity.PlainText"));
            DirectoryReader ireader = DirectoryReader.open(indexDirectory)) {

        IndexSearcher searcher = new IndexSearcher(ireader);

        // Pad punctuation with spaces so the whitespace tokenizer splits on it.
        Analyzer fullTextAnalyzer = CustomAnalyzer.builder()
                .addCharFilter("patternReplace", "pattern", "([\\-\\(\\)\\[\\],\\.;:])", "replacement", " $1 ")
                .withTokenizer("whitespace").build();

        Query query = new WildcardQuery(new Term("content", term));
        TopDocs hits = searcher.search(query, MAX_SEARCH_HITS);

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        ClaviusHighlighter highlighter = new ClaviusHighlighter(htmlFormatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter());

        log.info("hits.totalHits=(" + hits.totalHits + ")");
        for (int i = 0; i < hits.totalHits; i++) {
            int id = hits.scoreDocs[i].doc;
            Document doc = searcher.doc(id);
            String idDoc = doc.get("idDoc");

            // Re-tokenize the stored content so the highlighter can locate matches.
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                    fullTextAnalyzer);

            List<Annotation> frag = highlighter.getBestTextClaviusFragments(tokenStream, doc, false, 10);
            for (int j = 0; j < frag.size(); j++) {
                log.debug("idDoc: " + idDoc + ", Annotation[" + j + "] " + frag.get(j).toString());
            }
            result.addAll(frag);
        }
    } catch (InvalidTokenOffsetsException | IOException e) {
        // Best-effort: failures are logged and an empty/partial result returned.
        log.error(e);
    }
    log.info("Full Text Search found " + result.size() + " result(s) for term " + term);
    return result;
}

From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java

License:Open Source License

/**
 * Creates the searcher: validates/creates the index directory, builds a
 * whitespace + lowercase analyzer, and opens the Lucene index writer,
 * searcher manager and default sort over it.
 *
 * @param excludePatterns path matchers for files excluded from indexing
 * @param indexDirectory  root directory holding the on-disk index
 * @param root            workspace storage root
 * @param pathTransformer transformer between fs paths and workspace paths
 * @throws IOException if the configured index root is a file, or the
 *         directory/index cannot be created or opened
 */
@Inject
public LuceneSearcher(@Named("vfs.index_filter_matcher") Set<PathMatcher> excludePatterns,
        @Named("vfs.local.fs_index_root_dir") File indexDirectory,
        @Named("che.user.workspaces.storage") File root, PathTransformer pathTransformer) throws IOException {

    if (indexDirectory.exists()) {
        if (indexDirectory.isFile()) {
            throw new IOException("Wrong configuration `vfs.local.fs_index_root_dir` is a file");
        }
    } else {
        Files.createDirectories(indexDirectory.toPath());
    }

    this.indexDirectory = indexDirectory;
    this.root = root;
    this.excludePatterns = excludePatterns;
    this.pathTransformer = pathTransformer;
    // Whitespace tokenization with case folding only — no stemming/stop words.
    this.analyzer = CustomAnalyzer.builder().withTokenizer(WhitespaceTokenizerFactory.class)
            .addTokenFilter(LowerCaseFilterFactory.class).build();
    // Single-instance lock: this process is assumed to be the only writer.
    this.luceneIndexWriter = new IndexWriter(
            FSDirectory.open(indexDirectory.toPath(), new SingleInstanceLockFactory()),
            new IndexWriterConfig(analyzer));
    this.searcherManager = new SearcherManager(luceneIndexWriter, true, true, new SearcherFactory());
    // Relevance first, then path for a stable ordering of equal-score hits.
    this.sort = new Sort(SortField.FIELD_SCORE, new SortField(PATH_FIELD, SortField.Type.STRING));
}

From source file:org.quelea.services.lucene.BibleSearchIndex.java

License:Open Source License

/**
 * Creates a new empty bible search index backed by a temporary memory-mapped
 * directory.
 */
public BibleSearchIndex() {
    chapters = new HashMap<>();
    try {
        // Standard tokenizer + lowercase + ASCII folding so accented text
        // matches its unaccented form.
        analyzer = CustomAnalyzer.builder().withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class).addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-bible").toAbsolutePath());
    } catch (IOException ex) {
        // Fixed copy-paste bug: messages previously said "song search index"
        // although this is the bible index.
        LOGGER.log(Level.SEVERE, "Couldn't create bible search index");
        throw new RuntimeException("Couldn't create bible search index", ex);
    }
}

From source file:org.quelea.services.lucene.SongSearchIndex.java

License:Open Source License

/**
 * Creates a new, empty song search index backed by a temporary memory-mapped
 * directory.
 */
public SongSearchIndex() {
    songs = new HashMap<>();
    try {
        // Standard tokenizer, lower-cased, with ASCII folding applied last.
        CustomAnalyzer.Builder analyzerBuilder = CustomAnalyzer.builder();
        analyzerBuilder.withTokenizer(StandardTokenizerFactory.class);
        analyzerBuilder.addTokenFilter(LowerCaseFilterFactory.class);
        analyzerBuilder.addTokenFilter(ASCIIFoldingFilterFactory.class);
        analyzer = analyzerBuilder.build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-song").toAbsolutePath());
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create song search index");
        throw new RuntimeException("Couldn't create song search index", ex);
    }
}

From source file:org.zephyrsoft.sdb2.service.IndexerServiceImpl.java

License:Open Source License

/**
 * Rebuilds the index of the given type from the supplied songs on a
 * background thread, then publishes the new index via putIndex.
 */
@Override
public void index(final IndexType indexType, final Collection<Song> songs) {
    executor.execute(new Runnable() {
        @Override
        public void run() {
            // Timing only — duration is logged at the end.
            Stopwatch stopwatch = Stopwatch.createStarted();

            // In-memory index; replaced wholesale on each call.
            Directory directory = new RAMDirectory();
            try {
                LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
                LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
                // Standard tokenizer, lower-cased, then expanded into
                // 1..25-character n-grams.
                Analyzer analyzer = CustomAnalyzer.builder().withTokenizer("standard")
                        .addTokenFilter("lowercase")
                        .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25").build();
                IndexWriterConfig config = new IndexWriterConfig(analyzer);
                try (IndexWriter writer = new IndexWriter(directory, config)) {
                    for (Song song : songs) {
                        Document document = createDocument(song);
                        writer.addDocument(document);
                        songByUuid.put(song.getUUID(), song);
                    }
                } catch (IOException e) {
                    LOG.warn("couldn't index songs", e);
                }
            } catch (IOException e1) {
                LOG.warn("couldn't create analyzer", e1);
            } finally {
                // NOTE(review): the directory is published even when analyzer
                // creation failed, replacing the old index with an empty one —
                // presumably intentional best-effort; confirm.
                putIndex(indexType, directory);
                stopwatch.stop();
                LOG.info("indexing songs in background thread took {}", stopwatch.toString());
            }
        }
    });
}

From source file:tw.com.kyle.luminance.AnnotAnalyzerFactory.java

/**
 * Builds the {@link Analyzer} for the given annotation kind, falling back to
 * a {@link StandardAnalyzer} when no specific analyzer is defined for the
 * kind or when construction fails with an I/O error (which is logged).
 *
 * @param an_enum the annotation analyzer kind to build
 * @return a non-null analyzer
 */
public static Analyzer Get(AnnotAnalyzerEnum an_enum) {
    try {
        switch (an_enum) {
        case RangeAnnotAnalyzer: {
            // Offset tokenizer keyed on "(start,end,label)" triples; group 1
            // captures the triple's interior.
            Map<String, String> tokenizerParams = new HashMap<>();
            tokenizerParams.put("pattern", "\\((\\d+,\\d+,[^)]*)\\)");
            tokenizerParams.put("group", "1");
            Map<String, String> filterParams = new HashMap<>();
            return CustomAnalyzer.builder()
                    .withTokenizer(OffsetTokenizerFactory.class, tokenizerParams)
                    .addTokenFilter(OffsetTokenFilterFactory.class, filterParams)
                    .build();
        }
        case TokenAnnotAnalyzer:
            //! TokenAnnotAnalyzer is at best a convenience over
            //! RangeAnnotAnalyzer: a token is just a length-1 range, so no
            //! dedicated analyzer needs to be written for it.
            throw new UnsupportedOperationException("Not implemented");
        default:
            return new StandardAnalyzer();
        }
    } catch (IOException ex) {
        Logger.getLogger(AnnotAnalyzerFactory.class.getName()).log(Level.SEVERE, null, ex);
    }

    // Reached only after a logged IOException above.
    return new StandardAnalyzer();
}