Example usage for org.apache.lucene.analysis.tokenattributes TypeAttribute type

List of usage examples for org.apache.lucene.analysis.tokenattributes TypeAttribute type

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.tokenattributes TypeAttribute type.

Prototype

public String type();

Source Link

Document

Returns this Token's lexical type.

Usage

From source file:analysis.AnalyzerUtils.java

License:Apache License

/** Returns the lexical type reported by the {@link TypeAttribute} of the given source. */
public static String getType(AttributeSource source) {
    // addAttribute returns the existing instance when one is already registered.
    return source.addAttribute(TypeAttribute.class).type();
}

From source file:analysis.AnalyzerUtils.java

License:Apache License

/**
 * Prints a token-by-token breakdown of {@code text} as produced by {@code analyzer}:
 * one line per token position, each token shown as {@code [term:start->end:type]}.
 *
 * @param analyzer the analyzer whose token stream is inspected
 * @param text     the raw text to analyze
 * @throws IOException if the token stream cannot be read
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // #A
            new StringReader(text));

    TermAttribute term = stream.addAttribute(TermAttribute.class); // #B
    PositionIncrementAttribute posIncr = // #B
            stream.addAttribute(PositionIncrementAttribute.class); // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B

    try {
        // TokenStream contract: reset() must be called before the first incrementToken().
        stream.reset();

        int position = 0;
        while (stream.incrementToken()) { // #C
            int increment = posIncr.getPositionIncrement(); // #D
            // An increment of 0 stacks the token on the previous position (e.g. synonyms).
            if (increment > 0) { // #D
                position = position + increment; // #D
                System.out.println(); // #D
                System.out.print(position + ": "); // #D
            }

            System.out.print("[" + // #E
                    term.term() + ":" + // #E
                    offset.startOffset() + "->" + // #E
                    offset.endOffset() + ":" + // #E
                    type.type() + "] "); // #E
        }
        stream.end(); // flush end-of-stream state (final offsets)
    } finally {
        stream.close(); // release the underlying reader even on error
    }
    System.out.println();
}

From source file:analysis.FtpFilePathAnalyzer.java

License:Apache License

/**
 * Demo driver: tokenizes a small sample path with {@link FtpFilePathAnalyzer} and prints
 * each token as {@code (start,end) [posIncr,type] [term]}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);

    // Attributes are per-stream singletons, so fetch them once instead of on
    // every loop iteration; the generic getAttribute needs no cast.
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);

    try {
        // TokenStream contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
                    + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] " + "[" + termAtt.term()
                    + "]");
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:analyzers.DebugAnalyzer.java

License:Apache License

/**
* This method outputs token-by-token analysis of documents.
*
* @param    reader        the reader for the documents
* @param    analyzer      the analyzer /*from w ww .  java2s  .  com*/
* @throws   IOException   cannot load stream
*/
public static void showAnalysisFromStream(Reader reader, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", reader);
    CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            // get starting and ending offsets
            int start = oa.startOffset();
            int end = oa.endOffset();

            // text of the token
            String token = cta.toString();

            // part of speech tag for the token
            String tag = typeAtt.type();

            System.out.printf("start: %4d\tend: %4d\tlength: %4d\ttag: %s\ttoken: %s\n", start, end,
                    token.length(), tag, token);
        }
    } finally {
        stream.close();
    }
}

From source file:aos.lucene.analysis.AnalyzerUtils.java

License:Apache License

/**
 * Prints a token-by-token breakdown of {@code text} as produced by {@code analyzer}:
 * one line per token position, each token shown as {@code [term:start->end:type]}.
 *
 * @param analyzer the analyzer whose token stream is inspected
 * @param text     the raw text to analyze
 * @throws IOException if the token stream cannot be read
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // #A
            new StringReader(text));

    TermAttribute term = stream.addAttribute(TermAttribute.class); // #B
    PositionIncrementAttribute posIncr = // #B
            stream.addAttribute(PositionIncrementAttribute.class); // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B

    try {
        // TokenStream contract: reset() must be called before the first incrementToken().
        stream.reset();

        int position = 0;
        while (stream.incrementToken()) { // #C
            int increment = posIncr.getPositionIncrement(); // #D
            // An increment of 0 stacks the token on the previous position (e.g. synonyms).
            if (increment > 0) { // #D
                position = position + increment; // #D
                // LOGGER.info() takes no zero-arg overload and would not compile;
                // the call was clearly meant as the newline separating positions in
                // the System.out stream this method already writes to.
                System.out.println(); // #D
                System.out.print(position + ": "); // #D
            }

            System.out.print("[" + // #E
                    term.term() + ":" + // #E
                    offset.startOffset() + "->" + // #E
                    offset.endOffset() + ":" + // #E
                    type.type() + "] "); // #E
        }
        stream.end(); // flush end-of-stream state
    } finally {
        stream.close(); // release the underlying reader even on error
    }
    System.out.println();
}

From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License:Apache License

/**
 * Prints a token-by-token breakdown of {@code text}: one line per token position,
 * each token shown as {@code [term:start->end:type]}, with the payload bytes
 * appended (as a String) when a payload is present.
 *
 * @param analyzer the analyzer whose token stream is inspected
 * @param text     the raw text to analyze
 * @throws IOException if the token stream cannot be read
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    try {
        // TokenStream contract: reset() must be called before the first incrementToken().
        stream.reset();

        int position = 0;
        while (stream.incrementToken()) {

            int increment = posIncr.getPositionIncrement();
            // An increment of 0 stacks the token on the previous position (e.g. synonyms).
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }

            Payload pl = payload.getPayload();

            if (pl != null) {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                        + ":" + type.type() + ":" + new String(pl.getData()) + "] ");

            } else {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                        + ":" + type.type() + "] ");

            }

        }
        stream.end(); // flush end-of-stream state
    } finally {
        stream.close(); // release the underlying reader even on error
    }
    System.out.println();
}

From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java

License:Apache License

/**
 * Re-applies term-weight highlighting to the text pane: clears any previous
 * weight highlights and, if the checkbox is selected, tokenizes the displayed
 * text and highlights each term found in the template vector with a palette
 * colour derived from the term's value in the current input vector.
 * Requires both the template vector and input data files to be loaded;
 * otherwise logs an error and unchecks the box.
 */
private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }

        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);

        // value range of the whole data set, used to normalize per-term values
        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints: one highlight painter per palette colour
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        // NOTE(review): stream is neither reset() before consumption nor close()d
        // afterwards — works with Lucene 3.0 (reset was a no-op) but would fail on
        // Lucene 4+; confirm before upgrading.
        try {
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                // skip apostrophe tokens (e.g. possessives) — they are not vector terms
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(),
                        "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        // map normalized value into the middle half of the palette
                        // (offset by a quarter, scaled to half the palette length)
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        // NOTE(review): this bare startOffset() call has no effect —
                        // presumably leftover debugging; confirm and remove.
                        offsetAttribute.startOffset();
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}

From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java

License:Open Source License

/**
 * Tokenizes {@code text} with a {@link StandardAnalyzer} and feeds each token's
 * (type, term) pair to {@code lambda}.
 *
 * @param text   the text to tokenize
 * @param lambda receives (lexical type, term text) for every token, in stream order
 * @throws RuntimeException wrapping any {@link IOException} from the stream
 */
static void tokenize(String text, BiConsumer<String, String> lambda) {
    // Close the stream as well as the analyzer: TokenStream is AutoCloseable and
    // holds the underlying reader.
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
            TokenStream stream = analyzer.tokenStream("", text)) {
        // addAttribute is the safe idiom: it returns the existing instance or
        // registers one, whereas getAttribute throws if the attribute is absent.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
        stream.end(); // flush end-of-stream state before close
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

/**
 * Consumes the given tokenizer and renders every token as
 * {@code term:type:pos:semanticClass:posIncr:posLength:startOffset:endOffset,}
 * (note the trailing comma after each token), concatenated into one string.
 *
 * <p>NOTE(review): assumes the caller has already reset() the tokenizer —
 * confirm against the test fixtures.
 *
 * @param tokenizer the tokenizer to drain; end() is called but not close()
 * @return the concatenated token descriptions
 * @throws Exception if token consumption fails
 */
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    // addAttribute is generic — no casts needed.
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    while (tokenizer.incrementToken()) {
        result.append(new String(term.buffer(), 0, term.length())).append(":");
        result.append(type.type()).append(":");
        result.append(pos.partOfSpeech()).append(":");
        result.append(semanticClass.semanticClass()).append(":");
        result.append(posIncrAtt.getPositionIncrement()).append(":");
        result.append(posLengthAtt.getPositionLength()).append(":");
        result.append(extOffset.startOffset()).append(":");
        result.append(extOffset.endOffset());
        result.append(",");
    }
    tokenizer.end();
    return result.toString();
}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the
 * original user query.
 *
 * <p>Shingle tokens are ignored; SYNONYM tokens are collected either into a flat "bag"
 * or into a position-indexed graph from which all synonym combinations are built.
 *
 * @param synonymAnalyzer analyzer whose stream emits original terms and SYNONYM tokens
 * @param solrParams      request parameters controlling phrase construction and bag mode
 * @return the synonym queries derived from the expanded alternate query strings
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);

    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        // Attributes are per-stream singletons; fetch them once, not per token.
        CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't Quote when original is already surrounded by quotes
                    // (boundary checks also guard the charAt() calls against
                    // running off either end of the query string)
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());

                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}