List of usage examples for the `org.apache.lucene.analysis.core.SimpleAnalyzer` no-argument constructor `SimpleAnalyzer()`
public SimpleAnalyzer()
From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java
License:Apache License
/**
 * Loads the SKOS model from the given input stream using the supplied RDF
 * serialization format, builds an in-memory Lucene index over it, and
 * prepares an index searcher.
 *
 * @param inputStream stream containing the serialized SKOS model
 * @param lang serialization language; must be "N3", "RDF/XML", or "TURTLE"
 * @throws IOException if the format is unsupported or the model cannot be indexed
 */
public SKOSEngineImpl(InputStream inputStream, String lang) throws IOException {
    boolean supportedFormat = "N3".equals(lang) || "RDF/XML".equals(lang) || "TURTLE".equals(lang);
    if (!supportedFormat) {
        throw new IOException("Invalid RDF serialization format");
    }
    this.analyzer = new SimpleAnalyzer();
    this.skosModel = ModelFactory.createDefaultModel();
    this.skosModel.read(inputStream, null, lang);
    this.indexDir = new RAMDirectory();
    // Materialize inferred triples, then index the (entailed) model.
    entailSKOSModel();
    indexSKOSModel();
    this.searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java
License:Apache License
/**
 * Loads the SKOS model from a given filename or URI, starts the indexing
 * process into a file-system index, and sets up the index searcher.
 *
 * <p>The on-disk index directory name is derived from the input name plus a
 * signature of the requested languages, so different language subsets get
 * distinct index directories.
 *
 * @param indexPath base path under which the index directory is created
 * @param filenameOrURI file name or URI of the SKOS model; may be a .zip
 * @param languages the languages to be considered; may be null
 * @throws IOException if indexing the SKOS model fails
 */
public SKOSEngineImpl(String indexPath, String filenameOrURI, List<String> languages) throws IOException {
    this.analyzer = new SimpleAnalyzer();
    // Build a sorted, duplicate-free language signature, e.g. "-de-en".
    String langSig = "";
    if (languages != null) {
        this.languages = new TreeSet<>(languages);
        if (!this.languages.isEmpty()) {
            langSig = "-" + join(this.languages.iterator(), '-');
        }
    }
    String name = getName(filenameOrURI);
    File dir = new File(indexPath + name + langSig);
    this.indexDir = FSDirectory.open(dir.toPath());
    if (filenameOrURI != null) {
        // Resolve the model via Jena's FileManager: plain files, URLs, and
        // classpath resources are all acceptable sources.
        FileManager fileManager = new FileManager();
        fileManager.addLocatorFile();
        fileManager.addLocatorURL();
        fileManager.addLocatorClassLoader(SKOSEngineImpl.class.getClassLoader());
        if (getExtension(filenameOrURI).equals("zip")) {
            // For zip archives, look up the entry named like the archive base name.
            fileManager.addLocatorZip(filenameOrURI);
            filenameOrURI = getBaseName(filenameOrURI);
        }
        File inputFile = new File(filenameOrURI);
        // NOTE(review): Paths.get(parent, name) throws NPE when filenameOrURI
        // has no parent component — confirm callers always pass a path with a
        // directory part.
        Path inputPath = Paths.get(inputFile.getParent(), inputFile.getName());
        skosModel = fileManager.loadModel(inputPath.toUri().toString());
        // Materialize inferred triples, then index the (entailed) model.
        entailSKOSModel();
        indexSKOSModel();
        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java
License:Apache License
/**
 * Loads the SKOS model from the given input stream using the supplied RDF
 * serialization format, restricted to the given languages, indexes it
 * in memory, and prepares an index searcher.
 *
 * @param inputStream the input stream containing the serialized model
 * @param format serialization language; must be "N3", "RDF/XML", or "TURTLE"
 * @param languages the languages to be considered; may be null
 * @throws IOException if the format is unsupported or the model cannot be indexed
 */
public SKOSEngineImpl(InputStream inputStream, String format, List<String> languages) throws IOException {
    if (!("N3".equals(format) || "RDF/XML".equals(format) || "TURTLE".equals(format))) {
        throw new IOException("Invalid RDF serialization format");
    }
    if (languages != null) {
        // De-duplicate and sort the requested languages.
        this.languages = new TreeSet<>(languages);
    }
    analyzer = new SimpleAnalyzer();
    skosModel = ModelFactory.createDefaultModel();
    skosModel.read(inputStream, null, format);
    indexDir = new RAMDirectory();
    // Materialize inferred triples, then index the (entailed) model.
    entailSKOSModel();
    indexSKOSModel();
    searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}
From source file:at.ac.univie.mminf.luceneSKOS.test.SKOSLabelFilterTest.java
License:Apache License
/**
 * Indexes a single document and verifies that parsing the two-word query
 * "united nations" against the "content" field yields exactly one hit.
 */
@Test
public void testTermQuery() throws IOException, QueryNodeException {
    Document document = new Document();
    document.add(new Field("content", "I work for the united nations", TextField.TYPE_STORED));
    writer.addDocument(document);

    // Open a near-real-time reader over the uncommitted writer state.
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    StandardQueryParser queryParser = new StandardQueryParser(new SimpleAnalyzer());
    Query query = queryParser.parse("united nations", "content");
    assertEquals(1, searcher.search(query, 1).totalHits);
}
From source file:at.ac.univie.mminf.luceneSKOS.test.termexpansion.AbstractTermExpansionTest.java
License:Apache License
/** * This test indexes a sample metadata record (=lucene document) having a * "title", "description", and "subject" field, which contains plain subject * terms./*from w w w. ja v a 2 s .c om*/ * <p/> * A search for "arms" doesn't return that record because the term "arms" is * not explicitly contained in the record (document). * * @throws IOException * @throws LockObtainFailedException * @throws CorruptIndexException */ @Test public void noExpansion() throws IOException { /* defining the document to be indexed */ Document doc = new Document(); doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED)); doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED)); doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED)); /* setting up a writer with a default (simple) analyzer */ writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(new SimpleAnalyzer())); /* adding the document to the index */ writer.addDocument(doc); /* defining a query that searches over all fields */ BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD); /* creating a new searcher */ searcher = new IndexSearcher(DirectoryReader.open(writer, false)); TopDocs results = searcher.search(builder.build(), 10); /* no results are returned since there is no term match */ assertEquals(0, results.totalHits); }
From source file:at.ac.univie.mminf.luceneSKOS.test.termexpansion.LabelbasedTermExpansionTest.java
License:Apache License
/** * This test indexes a sample metadata record (=lucene document) having a * "title", "description", and "subject" field. * <p/>/*from www . ja v a 2s . c o m*/ * A search for "arms" returns that record as a result because "arms" is * defined as an alternative label for "weapons", the term which is * contained in the subject field. * * @throws IOException */ @Test public void labelBasedTermExpansion() throws IOException { /* defining the document to be indexed */ Document doc = new Document(); doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED)); doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED)); doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED)); /* setting up the SKOS analyzer */ String skosFile = "src/test/resources/skos_samples/ukat_examples.n3"; String indexPath = "build/"; /* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */ Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.LABEL); /* Define different analyzers for different fields */ Map<String, Analyzer> analyzerPerField = new HashMap<>(); analyzerPerField.put("subject", skosAnalyzer); PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField); /* setting up a writer with a default (simple) analyzer */ writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer)); /* adding the document to the index */ writer.addDocument(doc); /* defining a query that searches over all fields */ BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("description", "arms")), 
BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD); /* creating a new searcher */ searcher = new IndexSearcher(DirectoryReader.open(writer, false)); TopDocs results = searcher.search(builder.build(), 10); /* the document matches because "arms" is among the expanded terms */ assertEquals(1, results.totalHits); /* defining a query that searches for a broader concept */ Query query = new TermQuery(new Term("subject", "military equipment")); results = searcher.search(query, 10); /* ... also returns the document as result */ assertEquals(1, results.totalHits); }
From source file:at.ac.univie.mminf.luceneSKOS.test.termexpansion.URIbasedTermExpansionTest.java
License:Apache License
/** * This test indexes a sample metadata record (=lucene document) having a * "title", "description", and "subject" field, which is semantically * enriched by a URI pointing to a SKOS concept "weapons". * <p/>/*from ww w.ja v a 2 s .c o m*/ * A search for "arms" returns that record as a result because "arms" is * defined as an alternative label (altLabel) for the concept "weapons". * * @throws IOException */ @Test public void uriBasedTermExpansion() throws IOException { /* defining the document to be indexed */ Document doc = new Document(); doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED)); doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED)); doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED)); /* setting up the SKOS analyzer */ String skosFile = "src/test/resources/skos_samples/ukat_examples.n3"; String indexPath = "build/"; /* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */ Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.URI); /* Define different analyzers for different fields */ Map<String, Analyzer> analyzerPerField = new HashMap<>(); analyzerPerField.put("subject", skosAnalyzer); PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField); /* setting up a writer with a default (simple) analyzer */ writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer)); /* adding the document to the index */ writer.addDocument(doc); /* defining a query that searches over all fields */ BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(new TermQuery(new Term("title", "arms")), 
BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD); /* creating a new searcher */ searcher = new IndexSearcher(DirectoryReader.open(writer, false)); TopDocs results = searcher.search(builder.build(), 10); /* the document matches because "arms" is among the expanded terms */ assertEquals(1, results.totalHits); /* defining a query that searches for a broader concept */ Query query = new TermQuery(new Term("subject", "military equipment")); results = searcher.search(query, 10); /* ... also returns the document as result */ assertEquals(1, results.totalHits); }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
/**
 * Opens an {@link IndexWriter} over the given directory using a
 * {@link SimpleAnalyzer}.
 *
 * @param directory the Lucene directory to write the index into
 * @return a ready-to-use writer, or {@code null} if it could not be opened
 */
protected IndexWriter openWriter(Directory directory) {
    try {
        Analyzer analyzer = new SimpleAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        return new IndexWriter(directory, config);
    } catch (IOException ex) {
        // Fix: the original logged a null message, leaving no context in the
        // log record; a descriptive message makes the failure traceable.
        LOGGER.log(Level.SEVERE, "Failed to open IndexWriter on directory", ex);
        return null;
    }
}
From source file:com.codenvy.test.lucene.DeleteFilesWithSameName.java
License:Open Source License
public static void main(String[] args) throws Exception { String DOC_DIR_NAME = "files"; filesDirPath = Paths.get(DOC_DIR_NAME).toAbsolutePath().toString(); Path indexPath = Paths.get("index"); Path docDir = Paths.get(DOC_DIR_NAME); Path file1 = Paths.get(DOC_DIR_NAME, "File1"); Path file2 = Paths.get(DOC_DIR_NAME, "File1A"); Analyzer analyzer = new SimpleAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); if (!Files.isReadable(docDir)) { System.out.println("document folder not found"); return;//from w w w. jav a 2s . c o m } Directory index = FSDirectory.open(indexPath); IndexWriter writer = new IndexWriter(index, iwc); //add files to index indexDocs(writer, file1); indexDocs(writer, file2); writer.commit(); searchAndPrintResult(indexPath); //delete files System.out.println(); System.out.println("=================================================================="); System.out.println("delete by prefix \"" + filesDirPath + "/File1\""); Query query = new PrefixQuery(new Term(PATH, filesDirPath + "/File1")); writer.deleteDocuments(query); writer.close(); searchAndPrintResult(indexPath); }
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. Can handle ngrams of any length * and also perform stop word removal before extraction * // w w w. j a va2 s . c o m * @param text * the text that the ngrams should be extracted from * @param length * the length of the ngrams * @param stopWords * whether or not stopwords should be removed before extraction * @param overlap * whether or not the ngrams should overlap */ public void extract(String text, int length, Boolean stopWords, Boolean overlap) throws FileNotFoundException, IOException { this.text = text; this.length = length; this.stopWords = stopWords; this.overlap = overlap; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* * If the minLength and maxLength are both 1, then we want unigrams Make * use of a StopAnalyzer when stopwords should be removed Make use of a * SimpleAnalyzer when stop words should be included */ if (length == 1) { if (this.stopWords) { analyzer = new StandardAnalyzer(); } else { analyzer = new SimpleAnalyzer(); } } else { // Bigger than unigrams so use ShingleAnalyzerWrapper. Once // again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, ""); // This is a // hack to use // Lucene 2.4 // since in 2.4 // position // increments // weren't // preserved by // default. // Using a later // version puts // underscores // (_) in the // place of // removed stop // words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, ""); } } // Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); // OffsetAttribute offsetAttribute = // tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); // int tokenCount = 0; tokenStream.reset(); while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); // The actual token // term nGrams.add(termToken); // Add all ngrams to the ngram LinkedList // If n-grams are not allowed to overlap, then increment to point of // no overlap if (!overlap) { for (int i = 0; i < length - 1; i++) { tokenStream.incrementToken(); } } } // Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }