List of usage examples for org.apache.lucene.analysis.tokenattributes OffsetAttribute startOffset
public int startOffset();
From source file:NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. * Can handle ngrams of any length and also perform stop word removal before extraction * @param text the text that the ngrams should be extracted from * @param length the length of the ngrams * @param stopWords whether or not stopwords should be removed before extraction * @param overlap whether or not the ngrams should overlap *///from w w w. j a va 2s. c o m public void extract(String text, int length, Boolean stopWords, Boolean overlap) throws FileNotFoundException, IOException { this.text = text; this.length = length; this.stopWords = stopWords; this.overlap = overlap; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* If the minLength and maxLength are both 1, then we want unigrams * Make use of a StopAnalyzer when stopwords should be removed * Make use of a SimpleAnalyzer when stop words should be included */ if (length == 1) { if (this.stopWords) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } else { analyzer = new SimpleAnalyzer(Version.LUCENE_36); } } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ", false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_36), length, length, " ", false, false); } } //Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); int tokenCount = 0; while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); //The actual token term nGrams.add(termToken); //Add all ngrams to the ngram LinkedList //If n-grams are not allowed to overlap, then increment to point of no overlap if (!overlap) { for (int i = 0; i < length - 1; i++) { tokenStream.incrementToken(); } } } //Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", // #A new StringReader(text)); TermAttribute term = stream.addAttribute(TermAttribute.class); // #B PositionIncrementAttribute posIncr = // #B stream.addAttribute(PositionIncrementAttribute.class); // #B OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B int position = 0; while (stream.incrementToken()) { // #C int increment = posIncr.getPositionIncrement(); // #D if (increment > 0) { // #D position = position + increment; // #D System.out.println(); // #D System.out.print(position + ": "); // #D }/*from ww w. ja va 2 s. co m*/ System.out.print("[" + // #E term.term() + ":" + // #E offset.startOffset() + "->" + // #E offset.endOffset() + ":" + // #E type.type() + "] "); // #E } System.out.println(); }
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
public static void main(String[] args) { Analyzer ana = new FtpFilePathAnalyzer(); String test2 = "c++c++"; StringReader reader = new StringReader(test2); TokenStream ts = ana.tokenStream("path", reader); try {//ww w. j a v a2 s .c o m while (ts.incrementToken()) { TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts .getAttribute(PositionIncrementAttribute.class); TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") [" + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] " + "[" + termAtt.term() + "]"); } } catch (IOException e) { e.printStackTrace(); } }
From source file:analyzers.DebugAnalyzer.java
License:Apache License
/** * This method outputs token-by-token analysis of documents. * * @param reader the reader for the documents * @param analyzer the analyzer /*from w ww .j a v a 2s . co m*/ * @throws IOException cannot load stream */ public static void showAnalysisFromStream(Reader reader, Analyzer analyzer) throws IOException { TokenStream stream = analyzer.tokenStream("text", reader); CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class); OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class); TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class); try { stream.reset(); while (stream.incrementToken()) { // get starting and ending offsets int start = oa.startOffset(); int end = oa.endOffset(); // text of the token String token = cta.toString(); // part of speech tag for the token String tag = typeAtt.type(); System.out.printf("start: %4d\tend: %4d\tlength: %4d\ttag: %s\ttoken: %s\n", start, end, token.length(), tag, token); } } finally { stream.close(); } }
From source file:aos.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", // #A new StringReader(text)); TermAttribute term = stream.addAttribute(TermAttribute.class); // #B PositionIncrementAttribute posIncr = // #B stream.addAttribute(PositionIncrementAttribute.class); // #B OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B int position = 0; while (stream.incrementToken()) { // #C int increment = posIncr.getPositionIncrement(); // #D if (increment > 0) { // #D position = position + increment; // #D LOGGER.info(); // #D System.out.print(position + ": "); // #D }//from ww w . j a va2s . c om System.out.print("[" + // #E term.term() + ":" + // #E offset.startOffset() + "->" + // #E offset.endOffset() + ":" + // #E type.type() + "] "); // #E } LOGGER.info(); }
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); TypeAttribute type = stream.addAttribute(TypeAttribute.class); PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ":"); }// w ww. ja v a 2 s . com Payload pl = payload.getPayload(); if (pl != null) { System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + ":" + new String(pl.getData()) + "] "); } else { System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] "); } } System.out.println(); }
From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() { // remove previous highlighting removeHighLights(weightingHighLights); if (weightHighlightBox.isSelected()) { if (inputDataObjects.getTemplateVector() == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").severe( "Template vector file needed for displaying weights. Load from the File->Data files menu"); weightHighlightBox.setSelected(false); return; }/*from ww w.ja va 2s . co m*/ if (inputDataObjects.getInputData() == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").severe( "Input data file needed for displaying weights. Load from the File->Data files menu"); weightHighlightBox.setSelected(false); return; } SOMLibTemplateVector tv = inputDataObjects.getTemplateVector(); InputData data = inputDataObjects.getInputData(); InputDatum input = data.getInputDatum(currentInput); double maxValue = data.getMaxValue(); double minValue = data.getMinValue(); double span = maxValue - minValue; // init paints Palette p = paletteSelectionPanel.getSelectedPalette(); int paletteLength = p.getNumberOfColours(); weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength]; for (int i = 0; i < weightPaints.length; i++) { weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i)); } String text = textPane.getText(); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); try { while (stream.incrementToken()) { TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class); if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) { TermAttribute termAttribute = stream.getAttribute(TermAttribute.class); String term = termAttribute.term(); if (tv.containsLabel(term)) { int index = tv.getIndex(term); double value = input.getVector().getQuick(index); int colorIndex = (int) (paletteLength / 4d + relativeValue(minValue, span, value) * paletteLength / 2d); OffsetAttribute 
offsetAttribute = stream.getAttribute(OffsetAttribute.class); offsetAttribute.startOffset(); Object tag = highlighter.addHighlight(offsetAttribute.startOffset(), offsetAttribute.endOffset(), weightPaints[colorIndex]); weightingHighLights.add(tag); } } } } catch (IOException e) { e.printStackTrace(); } catch (BadLocationException e) { e.printStackTrace(); } } }
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException { Term termArr[] = new Term[2]; long offsetArr[] = new long[2]; for (int i = 0; i < 2; i++) { termArr[i] = null;// w ww . ja v a2s . c om offsetArr[i] = 0; } while (stream.incrementToken()) { Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())); if (termArr[0] == null) { termArr[0] = t; offsetArr[0] = offsetAtt.startOffset(); } else if (termArr[1] == null) { termArr[1] = t; offsetArr[1] = offsetAtt.startOffset(); } else { // shift termArr[0] = termArr[1]; offsetArr[0] = offsetArr[1]; // fill termArr[1] = t; offsetArr[1] = offsetAtt.startOffset(); } if (termArr[0] != null && termArr[1] != null) { long offsetDiff = offsetArr[1] - offsetArr[0]; if (offsetDiff > 0) { PhraseQuery.Builder pq = new PhraseQuery.Builder(); pq.setSlop((int) (offsetDiff) + 1); pq.add(termArr[0]); pq.add(termArr[1]); builder.add(pq.build(), BooleanClause.Occur.SHOULD); } } } }
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException { Term termArr[] = new Term[2]; long offsetArr[] = new long[2]; for (int i = 0; i < 2; i++) { termArr[i] = null;//from ww w. jav a 2 s. c o m offsetArr[i] = 0; } int count = 0; while (stream.incrementToken()) { Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())); if (count % 2 == 0) { termArr[0] = t; offsetArr[0] = offsetAtt.startOffset(); } else { termArr[1] = t; offsetArr[1] = offsetAtt.startOffset(); long offsetDiff = offsetArr[1] - offsetArr[0]; if (offsetDiff > 0) { PhraseQuery.Builder pq = new PhraseQuery.Builder(); pq.setSlop((int) (offsetDiff) + 1); pq.add(termArr[0]); pq.add(termArr[1]); builder.add(pq.build(), BooleanClause.Occur.SHOULD); } termArr[0] = null; termArr[1] = null; } count++; } if (termArr[0] != null) { builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD); termArr[0] = null; } }
From source file:br.bireme.ngrams.Tools.java
/**
 * Prints each token the analyzer produces for the given text together with its
 * start and end character offsets, in the form "term [start,end]".
 *
 * @param analyzer analyzer used to tokenize the text
 * @param fieldName name of the field passed to the analyzer
 * @param text the text to tokenize
 * @throws IOException if the token stream cannot be read
 */
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    final TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    final OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    try {
        while (tokenStream.incrementToken()) {
            final int startOffset = offsetAttribute.startOffset();
            final int endOffset = offsetAttribute.endOffset();
            final String term = charTermAttribute.toString();
            System.out.println(term + " [" + startOffset + "," + endOffset + "]");
        }
        // end() records the final offset state; close() releases stream resources
        // (the original leaked the stream by never closing it).
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
}