Example usage for org.apache.lucene.analysis.core SimpleAnalyzer SimpleAnalyzer

List of usage examples for org.apache.lucene.analysis.core SimpleAnalyzer SimpleAnalyzer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.core SimpleAnalyzer SimpleAnalyzer.

Prototype

public SimpleAnalyzer() 

Source Link

Document

Creates a new SimpleAnalyzer

Usage

From source file: at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java

License: Apache License

/**
 * Builds a SKOS engine from RDF data read off an InputStream. The
 * serialization language must be one of N3, RDF/XML, or TURTLE; any
 * other value is rejected before the stream is touched.
 *
 * @param inputStream stream carrying the serialized SKOS model
 * @param lang RDF serialization language of the stream content
 * @throws IOException if the serialization language is not supported
 */
public SKOSEngineImpl(InputStream inputStream, String lang) throws IOException {
    boolean supportedFormat = "N3".equals(lang) || "RDF/XML".equals(lang) || "TURTLE".equals(lang);
    if (!supportedFormat) {
        throw new IOException("Invalid RDF serialization format");
    }
    this.analyzer = new SimpleAnalyzer();
    this.skosModel = ModelFactory.createDefaultModel();
    this.skosModel.read(inputStream, null, lang);
    this.indexDir = new RAMDirectory();
    // Materialize SKOS entailments, build the in-memory index, then
    // open a searcher over it.
    entailSKOSModel();
    indexSKOSModel();
    this.searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java

License: Apache License

/**
 * This constructor loads the SKOS model from a given filename or URI,
 * starts the indexing process and sets up the index searcher.
 *
 * @param indexPath directory under which the Lucene index is created
 * @param filenameOrURI file name or URI of the SKOS data (zip archives are supported)
 * @param languages the languages to be considered
 * @throws IOException if indexing SKOS model fails
 */
public SKOSEngineImpl(String indexPath, String filenameOrURI, List<String> languages) throws IOException {
    this.analyzer = new SimpleAnalyzer();
    // Build a language signature (e.g. "-de-en") so that indexes for
    // different language selections end up in distinct directories.
    String langSig = "";
    if (languages != null) {
        this.languages = new TreeSet<>(languages);
        if (!this.languages.isEmpty()) {
            langSig = "-" + join(this.languages.iterator(), '-');
        }
    }
    // NOTE(review): getName() is invoked before the filenameOrURI null
    // check below — confirm that helper tolerates a null argument.
    String name = getName(filenameOrURI);
    File dir = new File(indexPath + name + langSig);
    this.indexDir = FSDirectory.open(dir.toPath());
    if (filenameOrURI != null) {
        // FileManager resolves the model from the file system, a URL,
        // or the classpath, in that registration order.
        FileManager fileManager = new FileManager();
        fileManager.addLocatorFile();
        fileManager.addLocatorURL();
        fileManager.addLocatorClassLoader(SKOSEngineImpl.class.getClassLoader());
        if (getExtension(filenameOrURI).equals("zip")) {
            // For zip archives: register the archive as a locator and
            // load the entry named after the archive's base name.
            fileManager.addLocatorZip(filenameOrURI);
            filenameOrURI = getBaseName(filenameOrURI);
        }
        File inputFile = new File(filenameOrURI);
        // NOTE(review): inputFile.getParent() is null for a bare file name,
        // and Paths.get(null, ...) throws NPE — verify callers always pass
        // a path with a parent component (or a URI resolved by the zip branch).
        Path inputPath = Paths.get(inputFile.getParent(), inputFile.getName());
        skosModel = fileManager.loadModel(inputPath.toUri().toString());
        entailSKOSModel();
        indexSKOSModel();
        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }
}

From source file: at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java

License: Apache License

/**
 * Builds a SKOS engine from RDF data on an InputStream, restricted to a
 * set of languages. The serialization format must be N3, RDF/XML, or
 * TURTLE.
 *
 * @param inputStream the input stream with the serialized SKOS model
 * @param format RDF serialization language of the stream
 * @param languages the languages to be considered (may be null)
 * @throws IOException if the serialization format is not supported
 */
public SKOSEngineImpl(InputStream inputStream, String format, List<String> languages) throws IOException {
    // Reject anything that is not one of the three supported formats.
    if (!"N3".equals(format) && !"RDF/XML".equals(format) && !"TURTLE".equals(format)) {
        throw new IOException("Invalid RDF serialization format");
    }
    if (languages != null) {
        this.languages = new TreeSet<>(languages);
    }
    analyzer = new SimpleAnalyzer();
    skosModel = ModelFactory.createDefaultModel();
    skosModel.read(inputStream, null, format);
    indexDir = new RAMDirectory();
    // Entail, index, and expose the in-memory index through a searcher.
    entailSKOSModel();
    indexSKOSModel();
    searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}

From source file: at.ac.univie.mminf.luceneSKOS.test.SKOSLabelFilterTest.java

License: Apache License

@Test
public void testTermQuery() throws IOException, QueryNodeException {
    // Index one document with a plain-text content field.
    Document document = new Document();
    document.add(new Field("content", "I work for the united nations", TextField.TYPE_STORED));
    writer.addDocument(document);

    // Search the near-real-time reader for the phrase terms.
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    Query query = new StandardQueryParser(new SimpleAnalyzer()).parse("united nations", "content");
    assertEquals(1, searcher.search(query, 1).totalHits);
}

From source file: at.ac.univie.mminf.luceneSKOS.test.termexpansion.AbstractTermExpansionTest.java

License: Apache License

/**
 * This test indexes a sample metadata record (=lucene document) having a
 * "title", "description", and "subject" field, which contains plain subject
 * terms./*from w w  w.  ja v  a  2  s .c  om*/
 * <p/>
 * A search for "arms" doesn't return that record because the term "arms" is
 * not explicitly contained in the record (document).
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 */
@Test
public void noExpansion() throws IOException {

    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field("description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(new SimpleAnalyzer()));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    TopDocs results = searcher.search(builder.build(), 10);

    /* no results are returned since there is no term match */
    assertEquals(0, results.totalHits);
}

From source file: at.ac.univie.mminf.luceneSKOS.test.termexpansion.LabelbasedTermExpansionTest.java

License: Apache License

/**
 * This test indexes a sample metadata record (=lucene document) having a
 * "title", "description", and "subject" field.
 * <p/>/*from   www . ja v  a  2s . c  o  m*/
 * A search for "arms" returns that record as a result because "arms" is
 * defined as an alternative label for "weapons", the term which is
 * contained in the subject field.
 *
 * @throws IOException
 */
@Test
public void labelBasedTermExpansion() throws IOException {

    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field("description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    String indexPath = "build/";

    /* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */
    Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.LABEL);

    /* Define different analyzers for different fields */
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);

    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    TopDocs results = searcher.search(builder.build(), 10);

    /* the document matches because "arms" is among the expanded terms */
    assertEquals(1, results.totalHits);

    /* defining a query that searches for a broader concept */
    Query query = new TermQuery(new Term("subject", "military equipment"));

    results = searcher.search(query, 10);

    /* ... also returns the document as result */
    assertEquals(1, results.totalHits);
}

From source file: at.ac.univie.mminf.luceneSKOS.test.termexpansion.URIbasedTermExpansionTest.java

License: Apache License

/**
 * This test indexes a sample metadata record (=lucene document) having a
 * "title", "description", and "subject" field, which is semantically
 * enriched by a URI pointing to a SKOS concept "weapons".
 * <p/>
 * A search for "arms" returns that record as a result because "arms" is
 * defined as an alternative label (altLabel) for the concept "weapons".
 *
 * @throws IOException
 */
@Test
public void uriBasedTermExpansion() throws IOException {

    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field("description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    /* the subject is a SKOS concept URI, not a plain term */
    doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));

    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    String indexPath = "build/";

    /* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */
    Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.URI);

    /* Define different analyzers for different fields: only "subject" is
       expanded; all other fields use the plain SimpleAnalyzer */
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);

    /* setting up a writer with the per-field analyzer over an in-memory index */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher over the near-real-time reader */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    TopDocs results = searcher.search(builder.build(), 10);

    /* the document matches because "arms" is among the expanded terms */
    assertEquals(1, results.totalHits);

    /* defining a query that searches for a broader concept */
    Query query = new TermQuery(new Term("subject", "military equipment"));

    results = searcher.search(query, 10);

    /* ... also returns the document as result */
    assertEquals(1, results.totalHits);

}

From source file: BlockBuilding.AbstractBlockBuilding.java

License: Apache License

/**
 * Opens an IndexWriter over the given directory using a SimpleAnalyzer.
 * On I/O failure the exception is logged and null is returned, so callers
 * must null-check the result.
 */
protected IndexWriter openWriter(Directory directory) {
    IndexWriter result = null;
    try {
        result = new IndexWriter(directory, new IndexWriterConfig(new SimpleAnalyzer()));
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
    return result;
}

From source file: com.codenvy.test.lucene.DeleteFilesWithSameName.java

License: Open Source License

// Demonstrates deleting indexed documents by a path-prefix query:
// indexes two files whose names share the prefix "File1", then deletes
// both with a single PrefixQuery.
public static void main(String[] args) throws Exception {
    String DOC_DIR_NAME = "files";
    filesDirPath = Paths.get(DOC_DIR_NAME).toAbsolutePath().toString();

    Path indexPath = Paths.get("index");
    Path docDir = Paths.get(DOC_DIR_NAME);

    // Two files whose names share the common prefix "File1".
    Path file1 = Paths.get(DOC_DIR_NAME, "File1");
    Path file2 = Paths.get(DOC_DIR_NAME, "File1A");

    Analyzer analyzer = new SimpleAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    // CREATE: start from a fresh index on every run.
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    if (!Files.isReadable(docDir)) {
        System.out.println("document folder not found");
        return;
    }

    Directory index = FSDirectory.open(indexPath);

    IndexWriter writer = new IndexWriter(index, iwc);

    //add files to index
    indexDocs(writer, file1);
    indexDocs(writer, file2);
    writer.commit();

    searchAndPrintResult(indexPath);

    // Delete every document whose PATH field starts with ".../File1".
    // NOTE(review): this prefix matches BOTH File1 and File1A — that is the
    // point of the demo. The "/" separator is hard-coded; confirm indexDocs
    // stores paths with forward slashes on all platforms.
    System.out.println();
    System.out.println("==================================================================");
    System.out.println("delete by prefix \"" + filesDirPath + "/File1\"");
    Query query = new PrefixQuery(new Term(PATH, filesDirPath + "/File1"));

    writer.deleteDocuments(query);
    writer.close();

    searchAndPrintResult(indexPath);
}

From source file: com.faqit.similarity.NGramExtractor.java

License: Open Source License

/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction
 * //  w  w  w.  j  a va2  s  .  c  o m
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If the minLength and maxLength are both 1, then we want unigrams Make
     * use of a StopAnalyzer when stopwords should be removed Make use of a
     * SimpleAnalyzer when stop words should be included
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else { // Bigger than unigrams so use ShingleAnalyzerWrapper. Once
             // again, different analyzers depending on stop word removal
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, ""); // This is a
            // hack to use
            // Lucene 2.4
            // since in 2.4
            // position
            // increments
            // weren't
            // preserved by
            // default.
            // Using a later
            // version puts
            // underscores
            // (_) in the
            // place of
            // removed stop
            // words.
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute =
    // tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // int tokenCount = 0;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {

        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // The actual token
        // term
        nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to point of
        // no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }

    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}