List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
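All of the examples below follow the same lifecycle: reset(), an incrementToken() loop, end(), and finally close(). As a quick orientation before the collected sources, here is a minimal self-contained sketch of that pattern; it is not taken from any of the projects below, the field name and input text are placeholders, and the no-argument StandardAnalyzer constructor assumes Lucene 5.x or later. Since TokenStream implements Closeable, try-with-resources handles close() automatically.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream ts = analyzer.tokenStream("body", "Some text to analyze")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // records end-of-stream state before close()
        }                                // ts.close() called here by try-with-resources
        analyzer.close();
    }
}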
From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java
License:Open Source License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        // read the token text via CharTermAttribute (reflectAsString() would return a
        // debug dump of all attributes, not the bare term)
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String term = termAttr.toString();
            int cnt = termFreq.getOrDefault(term, 0);
            termFreq.put(term, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
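A hypothetical call site for this helper; the StandardAnalyzer choice and the field name are assumptions for illustration, not from the original project:

// Hypothetical usage of the helper above; StandardAnalyzer is an assumed choice.
Analyzer analyzer = new StandardAnalyzer();
Map<String, Integer> freq = FeatureExtractorUtilities.getTermFreq(analyzer, "Red fish blue fish", "body");
// freq would map "red" -> 1, "fish" -> 2, "blue" -> 1 (StandardAnalyzer lowercases tokens)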
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();
    for (Fieldable field : doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);
            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());
                List<Integer> pvec = termPositions.get(term);
                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }
                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));
            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if more than one token is found.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
From source file:org.hibernate.search.backend.spi.SingularTermDeletionQuery.java
License:LGPL
@Override
public Query toLuceneQuery(DocumentBuilderIndexedEntity documentBuilder) {
    AnalyzerReference analyzerReferenceForEntity = documentBuilder.getAnalyzerReference();
    String stringValue = documentBuilder.objectToString(fieldName, this.getValue(),
            new ContextualExceptionBridgeHelper());
    if (this.getType() == Type.STRING) {
        try {
            if (analyzerReferenceForEntity.is(RemoteAnalyzerReference.class)) {
                // no need to take into account the analyzer here as it will be dealt with remotely
                return new TermQuery(new Term(this.getFieldName(), stringValue));
            }
            ScopedLuceneAnalyzer analyzerForEntity = (ScopedLuceneAnalyzer) analyzerReferenceForEntity
                    .unwrap(LuceneAnalyzerReference.class).getAnalyzer();
            TokenStream tokenStream = analyzerForEntity.tokenStream(this.getFieldName(), stringValue);
            tokenStream.reset();
            try {
                BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
                while (tokenStream.incrementToken()) {
                    String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
                    booleanQueryBuilder.add(new TermQuery(new Term(this.getFieldName(), term)), Occur.FILTER);
                }
                return booleanQueryBuilder.build();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            throw new AssertionFailure(
                    "No IOException can occur while using a TokenStream that is generated via String");
        }
    } else {
        FieldBridge fieldBridge = documentBuilder.getBridge(fieldName);
        if (NumericFieldUtils.isNumericFieldBridge(fieldBridge)) {
            return NumericFieldUtils.createExactMatchQuery(fieldName, this.getValue());
        } else {
            return new TermQuery(new Term(this.getFieldName(), stringValue));
        }
    }
}
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect whether, for a given position, more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; // start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions.
     * Note that a MultiPhraseQuery is needed if several terms share the same position,
     * as it will do an OR and not an AND like PhraseQuery.
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();
    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to "
                + "pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}
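This helper targets the pre-4.0 Lucene API: reusableTokenStream and TermAttribute were both removed in Lucene 4. A minimal sketch of the same helper against the modern API might look like the following; this is a hypothetical rewrite for comparison, not code from the Hibernate Search sources:

// Hypothetical modern-API (Lucene 4.x+) equivalent of the helper above:
// tokenStream(..) replaces reusableTokenStream(..), CharTermAttribute replaces TermAttribute,
// and try-with-resources replaces the explicit close() call.
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(localText))) {
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (attribute.length() > 0) {
                terms.add(attribute.toString());
            }
        }
        stream.end(); // records end-of-stream state before the implicit close()
    }
    return terms;
}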
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if more than one token is found.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    StringReader textReader = new StringReader(searchString);
    TokenStream tokenStream = analyzer.tokenStream(field, textReader);
    tokenStream.reset();
    List<String> terms = new ArrayList<>();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        terms.add(charTermAttribute.toString());
    }
    textReader.close();
    tokenStream.close();
    // note: this closes the analyzer itself, so callers must pass a fresh Analyzer on each call
    analyzer.close();
    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        // treat the last (possibly partial) term as a prefix query
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery((new Term(field, last))), Occur.MUST);
    }
    terms.stream().forEach((s) -> {
        bq.add(new TermQuery(new Term(field, s)), Occur.MUST);
    });
    return bq;
}
From source file:org.index.Tag.java
private String getBagOfWords(String text) throws Exception {
    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); /* SOAnalyzer(toStem, stopFile) */
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}