List of usage examples for org.apache.lucene.analysis.tokenattributes.TypeAttribute#type()
public String type();
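TypeAttribute.type() returns the token's lexical type as a plain String: StandardTokenizer assigns values such as "<ALPHANUM>" and "<NUM>", synonym filters mark injected terms as "SYNONYM", and the default value is "word" (TypeAttribute.DEFAULT_TYPE). The minimal sketch below is not taken from any of the source files listed on this page; it assumes a Lucene version (4.x or later) that provides the no-argument StandardAnalyzer constructor and tokenStream(String, String), and the field name and sample text are arbitrary placeholders. It illustrates the reset()/incrementToken()/end()/close() contract that some of the older examples below omit.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("field", "3 quick brown foxes");
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            stream.reset();                          // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                // type() reports the lexical type assigned by the tokenizer/filters
                System.out.println(term + " -> " + type.type());
            }
            stream.end();
            stream.close();
        }
    }
}

For the sample text above, StandardAnalyzer would typically report "3" as <NUM> and the remaining terms as <ALPHANUM>.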
From source file:analysis.AnalyzerUtils.java
License:Apache License
public static String getType(AttributeSource source) {
    TypeAttribute attr = source.addAttribute(TypeAttribute.class);
    return attr.type();
}
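Because addAttribute() returns the instance already registered with the AttributeSource when one exists, this helper reads the type of whatever token the source is currently positioned on; it does not advance the stream itself.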
From source file:analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents",                    // #A
            new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);           // #B
    PositionIncrementAttribute posIncr =                                     // #B
            stream.addAttribute(PositionIncrementAttribute.class);           // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);     // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);           // #B

    int position = 0;
    while (stream.incrementToken()) {                                        // #C
        int increment = posIncr.getPositionIncrement();                      // #D
        if (increment > 0) {                                                 // #D
            position = position + increment;                                 // #D
            System.out.println();                                            // #D
            System.out.print(position + ": ");                               // #D
        }
        System.out.print("[" +                                               // #E
                term.term() + ":" +                                          // #E
                offset.startOffset() + "->" +                                // #E
                offset.endOffset() + ":" +                                   // #E
                type.type() + "] ");                                         // #E
    }
    System.out.println();
}
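Note that this example (and several of the ones that follow) uses TermAttribute, the pre-Lucene-4 term API; from Lucene 4.0 onward the equivalent accessor is CharTermAttribute, as used in the DebugAnalyzer example below.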
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);
    try {
        while (ts.incrementToken()) {
            TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAtt =
                    (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
            TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
            System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
                    + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] "
                    + "[" + termAtt.term() + "]");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:analyzers.DebugAnalyzer.java
License:Apache License
/**
 * This method outputs token-by-token analysis of documents.
 *
 * @param reader   the reader for the documents
 * @param analyzer the analyzer
 * @throws IOException cannot load stream
 */
public static void showAnalysisFromStream(Reader reader, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", reader);
    CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // get starting and ending offsets
            int start = oa.startOffset();
            int end = oa.endOffset();
            // text of the token
            String token = cta.toString();
            // part of speech tag for the token
            String tag = typeAtt.type();
            System.out.printf("start: %4d\tend: %4d\tlength: %4d\ttag: %s\ttoken: %s\n", start, end,
                    token.length(), tag, token);
        }
    } finally {
        stream.close();
    }
}
From source file:aos.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents",                    // #A
            new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);           // #B
    PositionIncrementAttribute posIncr =                                     // #B
            stream.addAttribute(PositionIncrementAttribute.class);           // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);     // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);           // #B

    int position = 0;
    while (stream.incrementToken()) {                                        // #C
        int increment = posIncr.getPositionIncrement();                      // #D
        if (increment > 0) {                                                 // #D
            position = position + increment;                                 // #D
            LOGGER.info("");                                                 // #D
            System.out.print(position + ": ");                               // #D
        }
        System.out.print("[" +                                               // #E
                term.term() + ":" +                                          // #E
                offset.startOffset() + "->" +                                // #E
                offset.endOffset() + ":" +                                   // #E
                type.type() + "] ");                                         // #E
    }
    LOGGER.info("");
}
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }
        Payload pl = payload.getPayload();
        if (pl != null) {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");
        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");
        }
    }
    System.out.println();
}
From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);
        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        offsetAttribute.startOffset();
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}
From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java
License:Open Source License
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = (CharTermAttribute) tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = (TypeAttribute) tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = (SemanticClassAttribute) tokenizer
            .addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = (PartOfSpeechAttribute) tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    while (tokenizer.incrementToken() == true) {
        result.append(new String(term.buffer(), 0, term.length())).append(":");
        result.append(type.type()).append(":");
        result.append(pos.partOfSpeech()).append(":");
        result.append(semanticClass.semanticClass()).append(":");
        result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
        result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
        result.append(String.valueOf(extOffset.startOffset())).append(":");
        result.append(String.valueOf(extOffset.endOffset()));
        result.append(",");
    }
    tokenizer.end();
    return result.toString();
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return the expanded synonym queries
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));
    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}