List of usage examples for org.apache.lucene.analysis TokenStream end

public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
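The examples below all follow the same consume sequence: reset(), an incrementToken() loop, end(), close(). As a quick orientation, here is a minimal self-contained sketch of that sequence against a recent Lucene version; the StandardAnalyzer and the field name "body" are illustrative assumptions, not taken from any of the projects listed below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer and the field name "body" are illustrative choices.
        Analyzer analyzer = new StandardAnalyzer();
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello TokenStream end example")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();                            // records final state, e.g. the end offset of the text
            int endOffset = offset.endOffset();
            System.out.println(tokens + " endOffset=" + endOffset);
        }                                        // try-with-resources calls ts.close()
        analyzer.close();
    }
}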
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file:org.alfresco.solr.SolrInformationServer.java
License:Open Source License
private void addContentPropertyToDocUsingAlfrescoRepository(SolrInputDocument doc, QName propertyQName,
        long dbId, String locale) throws AuthenticationException, IOException {
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null);

    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_STATUS, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_EXCEPTION, response);
    addContentPropertyMetadata(doc, propertyQName,
            AlfrescoSolrDataModel.ContentFieldType.TRANSFORMATION_TIME, response);

    InputStream ris = response.getContent();
    String textContent = "";
    try {
        if (ris != null) {
            // Get and copy content
            byte[] bytes = FileCopyUtils.copyToByteArray(new BoundedInputStream(ris, contentStreamLimit));
            textContent = new String(bytes, StandardCharsets.UTF_8);
        }
    } finally {
        // release the response only when the content has been read
        response.release();
    }

    if (minHash && textContent.length() > 0) {
        Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
        TokenStream ts = analyzer.tokenStream("min_hash", textContent);
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder tokenBuff = new StringBuilder();
            char[] buff = termAttribute.buffer();
            for (int i = 0; i < termAttribute.length(); i++) {
                tokenBuff.append(Integer.toHexString(buff[i]));
            }
            doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
        }
        ts.end();
        ts.close();
    }

    long end = System.nanoTime();
    this.getTrackerStats().addDocTransformationTime(end - start);

    StringBuilder builder = new StringBuilder(textContent.length() + 16);
    builder.append("\u0000").append(locale).append("\u0000");
    builder.append(textContent);
    String localisedText = builder.toString();

    for (FieldInstance field : AlfrescoSolrDataModel.getInstance()
            .getIndexedFieldNamesForProperty(propertyQName).getFields()) {
        doc.removeField(field.getField());
        if (field.isLocalised()) {
            doc.addField(field.getField(), localisedText);
        } else {
            doc.addField(field.getField(), textContent);
        }
        addFieldIfNotSet(doc, field);
    }
}
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    TokenStream source;
    if (analyzerIn == null)
        analyzerIn = analyzer;

    try {
        source = analyzerIn.tokenStream(field, new StringReader(part));
        source.reset();
    } catch (IOException e) {
        throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
    }

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    try {
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
    } catch (IOException e) {
        throw new RuntimeException("error analyzing range part: " + part, e);
    }

    try {
        source.end();
        source.close();
    } catch (IOException e) {
        throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
    }

    return BytesRef.deepCopyOf(bytes);
}
From source file:org.apache.jackrabbit.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License:Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java
License:Apache License
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc the lucene document with index fields from <code>state</code>.
 * @param ifv the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream
                                        .addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0,
                                        termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?')
 *
 * @param text
 * @param analyzer
 * @return
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}