List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
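Every example on this page follows the same lifecycle contract: obtain the stream from an Analyzer, reset() it, consume tokens via incrementToken(), call end(), and finally close(). Since TokenStream implements Closeable, try-with-resources can take care of the close() call. Below is a minimal sketch of that pattern; the StandardAnalyzer, the field name "myfield", and the sample text are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources invokes TokenStream.close() even if consumption fails
        try (TokenStream stream = analyzer.tokenStream("myfield", new StringReader("a quick demo"))) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end(); // records end-of-stream state before close()
        }
        analyzer.close();
    }
}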
From source file:at.itbh.bev.apibeans.FinderImpl.java
License:Open Source License
public FullTextQuery constructQuery(EntityManager em, String postalCode, String place, String addressLine,
        String houseId) throws InvalidApiUsageException {
    FullTextEntityManager fullTextEm = Search.getFullTextEntityManager(em);

    if ((Objects.toString(postalCode, "") + Objects.toString(place, "") + Objects.toString(addressLine, "")
            + Objects.toString(houseId, "")).length() == 0) {
        throw new InvalidApiUsageException(
                "At least one parameter must be provided. Coordinates don't count as parameters.");
    }
    if (addressLine != null && addressLine.length() > 0 && addressLine.length() < 2) {
        throw new InvalidApiUsageException("The parameter addressLine must consist of at least 2 characters.");
    }

    QueryBuilder b = fullTextEm.getSearchFactory().buildQueryBuilder().forEntity(AdresseDenormalized.class)
            .get();
    List<Query> queries = new ArrayList<>();

    if (postalCode != null && postalCode.length() > 0) {
        queries.add(b.keyword().onField("postalCode").boostedTo(20).matching(postalCode).createQuery());
    }

    if (addressLine != null && addressLine.length() > 0) {
        // triple the addressLine since in the data source it is also tripled
        // if there is no building or address name
        queries.add(b.keyword().onField("addressLine").matching(addressLine + addressLine + addressLine)
                .createQuery());
        queries.add(b.keyword().onField("addressLineExact").boostedTo(10)
                .matching(addressLine + addressLine + addressLine).createQuery());
    }

    if (houseId != null && houseId.length() > 0) {
        // if the search string contains a number, take the first number in the
        // search string and match it against the house number
        Matcher matcher = housenumberPattern.matcher(houseId);
        if (matcher.find()) {
            queries.add(
                    b.keyword().onField("hausnrzahl").boostedTo(50).matching(matcher.group(1)).createQuery());
        }
        if (houseId.matches(".*\\D.*")) {
            queries.add(b.keyword().onField("houseIdExact").matching(houseId).createQuery());
        }
        queries.add(b.keyword().onField("houseId").boostedTo(20).matching(houseId).createQuery());

        TextAnalyzer analyzer = new TextAnalyzer();
        TokenStream stream;
        try {
            stream = analyzer.tokenStream(null, new StringReader(houseId));
            stream.reset();
            if (stream.incrementToken()) {
                // if the analyzer does not remove everything, also check hofname and hausnrgebaeudebez
                queries.add(b.keyword().onField("hofname").matching(houseId).createQuery());
                queries.add(b.keyword().onField("hausnrgebaeudebez").matching(houseId).createQuery());
            }
            stream.end();
            stream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        analyzer.close();
    }

    if (place != null && place.length() > 0) {
        queries.add(b.keyword().onField("place").matching(place).createQuery());
        queries.add(b.keyword().onField("municipalityExact").boostedTo(20).matching(place).createQuery());
        queries.add(b.keyword().onField("placeExact").boostedTo(5).matching(place).createQuery());
    }

    @SuppressWarnings("rawtypes")
    BooleanJunction bq = b.bool();
    for (Query item : queries) {
        bq = bq.should(item);
    }

    FullTextQuery fullTextQuery = fullTextEm.createFullTextQuery(bq.createQuery(), AdresseDenormalized.class);
    return fullTextQuery;
}
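The houseId branch above uses a bare incrementToken() call just to check whether the analyzer keeps anything of the input. If that check is needed in more than one place, it can be isolated in a small helper. A minimal sketch, given the same imports as the sketch at the top of this page; the helper name producesTokens is hypothetical, not part of the original source:

// Hypothetical helper (not in the original source): returns true if the
// analyzer emits at least one token for the given text.
static boolean producesTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
        stream.reset();
        boolean hasToken = stream.incrementToken();
        stream.end();
        return hasToken;
    }
}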
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java
License:Apache License
@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));

        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        tokenStream.end(); // finish the stream before closing, per the TokenStream contract
        tokenStream.close();
        reader.close();

        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        stream.end();
        stream.close();
        // Tokenization ends here.

        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        BooleanQuery wrapBq = new BooleanQuery();

        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                // The fuzzy query is what performs the approximate matching.
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }

        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Time to build the query: " + (System.currentTimeMillis() - start) + "ms";

        // Fetch the documents matched by the search.
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Search time: " + (System.currentTimeMillis() - start) + "ms";

        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java
public List<String> tokenizeString(Analyzer analyzer, String tweet) {
    List<String> result = new ArrayList<String>();

    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end(); // consume end-of-stream state before close(), per the TokenStream contract
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    return result;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public boolean contain(String label) {
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // collect the query terms for the boolean query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        boolean ret = collector.isExistQueryLabel();
        reader.close();
        return ret;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return false;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }

    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // get the terms from the query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query, expanding each term with its synonyms
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);

            // expand using synonyms, stemming each synonym before adding it
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";

    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    stream.reset();

    // print all tokens until the stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt.term());
    }
    stream.end();
    stream.close();
}
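The TermAttribute used here (and in the two examples above) dates these sources to Lucene 3.x; that attribute was removed in Lucene 4.0 in favor of CharTermAttribute. On a modern Lucene the attribute lines would read roughly:

    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    ...
    System.out.println(termAtt.toString());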
From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License:Open Source License
/**
 * Tokenize the given string: all words are extracted, lowercased, and
 * replaced with their stems, and all stop words are removed.
 *
 * @param label
 * @return
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get the analyzed version of the query string.
 *
 * This uses the analyzer of the FieldType configured for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query is
 * returned unchanged.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }

    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    tokens.reset();
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
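Note that the loop appends the raw term buffers with no separator, so a multi-term query collapses into a single unbroken string. As an illustration, assuming the configured field type merely lowercases and splits on whitespace (the actual output depends on the queryFieldType):

    String norm = getAnalyzedQuery("New York"); // would yield "newyork" under that assumption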