Example usage for org.apache.lucene.analysis.standard StandardAnalyzer tokenStream

List of usage examples for org.apache.lucene.analysis.standard StandardAnalyzer tokenStream

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.standard StandardAnalyzer tokenStream.

Prototype

public final TokenStream tokenStream(final String fieldName, final Reader reader) 

Source Link

Document

Returns a TokenStream suitable for fieldName, tokenizing the contents of reader.

Usage

From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java

License:Apache License

private void updateWeightHighlighting() {
    // Clear the highlights from any previous invocation before recomputing.
    removeHighLights(weightingHighLights);
    if (!weightHighlightBox.isSelected()) {
        return;
    }
    // Both the template vector and the input data are required to map terms to weights.
    if (inputDataObjects.getTemplateVector() == null) {
        Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                "Template vector file needed for displaying weights. Load from the File->Data files menu");
        weightHighlightBox.setSelected(false);
        return;
    }
    if (inputDataObjects.getInputData() == null) {
        Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                "Input data file needed for displaying weights. Load from the File->Data files menu");
        weightHighlightBox.setSelected(false);
        return;
    }

    SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
    InputData data = inputDataObjects.getInputData();
    InputDatum input = data.getInputDatum(currentInput);

    double maxValue = data.getMaxValue();
    double minValue = data.getMinValue();
    double span = maxValue - minValue;

    // One highlight painter per palette colour.
    Palette p = paletteSelectionPanel.getSelectedPalette();
    int paletteLength = p.getNumberOfColours();
    weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
    for (int i = 0; i < weightPaints.length; i++) {
        weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
    }

    String text = textPane.getText();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    try {
        // Honour the TokenStream contract: reset before consuming, end after.
        stream.reset();
        while (stream.incrementToken()) {
            TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
            if (at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(),
                    "<APOSTROPHE>")) {
                continue; // skip possessive tokens such as "John's"
            }
            TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
            String term = termAttribute.term();
            if (!tv.containsLabel(term)) {
                continue; // term unknown to the template vector -> no weight to display
            }
            int index = tv.getIndex(term);
            double value = input.getVector().getQuick(index);
            // Map the relative weight into the middle half of the palette.
            int colorIndex = (int) (paletteLength / 4d
                    + relativeValue(minValue, span, value) * paletteLength / 2d);
            OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
            Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                    offsetAttribute.endOffset(), weightPaints[colorIndex]);
            weightingHighLights.add(tag);
        }
        stream.end();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (BadLocationException e) {
        e.printStackTrace();
    } finally {
        // Release the token stream even when highlighting fails part-way.
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * This function assumes that the TFIDF vector of the document containing text is already
 * given. We simply build a tfidf-vector of the text out of the docVector. 
 * The purpose of doing this is to save the time computing the tf-idf value for words in
 * the same document./*  w w  w.  j  a v a2  s. co  m*/
 * 
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();

    //preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer).
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();

            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    analyzer.close();

    return map;
}

From source file:com.oneis.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {
        // Scorer selects the terms which need highlighting. Created from a 'query'
        // based on the extracted search terms when any are present; otherwise a
        // no-op scorer with plain fixed-size fragments.
        Scorer scorer;
        Fragmenter fragmenter;
        if (searchTerms != null && searchTerms.length() > 0) {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            Query query = queryParser.parse(searchTerms);
            scorer = new QueryScorer(query);
            fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength);
        } else {
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);
        }

        // Parse the escaped text into tokens, which retain the positions in the text.
        // The analyzer is Closeable; release it once highlighting is done.
        try (StandardAnalyzer analyser = new StandardAnalyzer()) {
            TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

            // Finally, do the highlighting!
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
            highlighter.setTextFragmenter(fragmenter);
            return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
        }
    } catch (Exception e) {
        Logger.getLogger("com.oneis.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;
    }
}

From source file:com.recomdata.search.Finder.java

License:Open Source License

/**
 * Translates the recognised filter keys (REPOSITORY, PATH, EXTENSION,
 * NOTEXTENSION) into a Lucene filter that ANDs the individual criteria.
 * Returns null when no criterion produced a clause.
 */
private Filter buildFilter(LinkedHashMap<String, ArrayList<String>> filterTerms) {

    BooleanQuery conjunction = new BooleanQuery();

    // Repository names are matched verbatim and OR-ed together.
    if (filterTerms.containsKey("REPOSITORY")) {
        ArrayList<Query> repositoryQueries = new ArrayList<Query>();
        for (String repository : filterTerms.get("REPOSITORY")) {
            repositoryQueries.add(new TermQuery(new Term("repository", repository)));
        }
        addQueries(conjunction, repositoryQueries);
    }

    // The first PATH value is analyzed into tokens and required as a phrase.
    if (filterTerms.containsKey("PATH")) {
        try {
            ArrayList<String> paths = filterTerms.get("PATH");
            if (paths.size() > 0) {
                StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream tokenizer = analyzer.tokenStream("path", new StringReader(paths.get(0)));
                PhraseQuery phraseQuery = new PhraseQuery();
                Token token = tokenizer.next(new Token());
                while (token != null) {
                    phraseQuery.add(new Term("path", token.term()));
                    token = tokenizer.next(token);
                }
                conjunction.add(phraseQuery, BooleanClause.Occur.MUST);
            }
        } catch (IOException ex) {
            // do nothing
        }
    }

    // Extensions are lower-cased and OR-ed together.
    if (filterTerms.containsKey("EXTENSION")) {
        ArrayList<Query> extensionQueries = new ArrayList<Query>();
        for (String extension : filterTerms.get("EXTENSION")) {
            extensionQueries.add(new TermQuery(new Term("extension", extension.toLowerCase())));
        }
        addQueries(conjunction, extensionQueries);
    }

    // NOTEXTENSION values are excluded outright.
    if (filterTerms.containsKey("NOTEXTENSION")) {
        for (String extension : filterTerms.get("NOTEXTENSION")) {
            conjunction.add(new TermQuery(new Term("extension", extension.toLowerCase())),
                    BooleanClause.Occur.MUST_NOT);
        }
    }

    if (conjunction.clauses().size() > 0) {
        return new QueryWrapperFilter(conjunction);
    }
    return null;

}

From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java

License:Apache License

/**
 * Command-line entry point: tokenizes an input file with StandardAnalyzer and
 * classifies the resulting token array against a Mahout Bayes model directory,
 * printing the top five labels with their scores to stderr.
 */
public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Input file").withShortName("i").create();

    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("Model to use when classifying data").withShortName("m").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputFile = new File(cmdLine.getValue(inputOpt).toString());

        if (!inputFile.isFile()) {
            throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
        }

        File modelDir = new File(cmdLine.getValue(modelOpt).toString());

        if (!modelDir.isDirectory()) {
            throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
        }

        BayesParameters p = new BayesParameters();
        p.set("basePath", modelDir.getCanonicalPath());
        Datastore ds = new InMemoryBayesDatastore(p);
        Algorithm a = new BayesAlgorithm();
        ClassifierContext ctx = new ClassifierContext(a, ds);
        ctx.initialize();

        //TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        InputStreamReader reader = new InputStreamReader(new FileInputStream(inputFile), "UTF-8");
        ArrayList<String> tokens = new ArrayList<String>(1000);
        try {
            TokenStream ts = analyzer.tokenStream(null, reader);
            // TokenStream contract: reset before consuming, end/close afterwards.
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(ts.getAttribute(CharTermAttribute.class).toString());
            }
            ts.end();
            ts.close();
        } finally {
            // Close the file handle and analyzer even if tokenization fails.
            reader.close();
            analyzer.close();
        }
        String[] document = tokens.toArray(new String[tokens.size()]);

        ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);

        for (ClassifierResult r : cr) {
            System.err.println(r.getLabel() + "\t" + r.getScore());
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } catch (InvalidDatastoreException e) {
        log.error("InvalidDataStoreException", e);
    }
}

From source file:edu.ncsu.dre.impl.engine.LexicalSegregator.java

License:Open Source License

/**
 * This method implements the Lexical segeration functionality of a given String
 * which is available as an Object. Returns List of Objects. The lexical segregator 
 * splits the given String artifact into words and sequences of words. Prepares them 
 * into lists of strings as shown below. The scheduler will use these words and sequence 
 * of words to gather more information on the artifact.
 * /*  w w w  .j av  a2 s . com*/
 * @param artifact
 * 
 * @return Collection<Object>
 *  
 * {@code
 * ReturnList
 *    |__List of words 
 *      |__List of sub sequences (of configurable length)
 * }
 */
public Collection<Object> segregateArtifact(Object artifact) {

    logger.trace("segregateArtifact(Object artifact)");

    String sArtifact = (String) artifact;

    List<Object> queryList = new ArrayList<Object>();

    ArrayList<String> wordList = new ArrayList<String>();

    try {
        StandardAnalyzer analyst = new StandardAnalyzer();

        if (sArtifact == null)
            return queryList;

        TokenStream tokenStream = analyst.tokenStream("Input Stream",
                new java.io.StringReader(sArtifact.trim()));

        Token word = null;
        do { //Remove duplicates and insert into the list
            if (word != null && !wordList.contains(word.termText()))
                wordList.add(word.termText());
            word = tokenStream.next();
        } while (word != null);
    } catch (java.io.IOException ioe) {
        logger.error("IOException occured while parsing input stream!", ioe);
    }
    queryList.add(wordList);
    return queryList;
}

From source file:edu.ncsu.dre.impl.engine.SRRSimCGAggregator.java

License:Open Source License

/**
 * Retrieves the set of distinct terms of the input string, together with the
 * number of times each term occurs.
 *
 * @param input the string to analyse; may be null
 * @return a map from each distinct term to its occurrence count (empty when the
 *         input is null or tokenization fails)
 */
public Map<String, Integer> getDistinctTerms(String input) {
    HashMap<String, Integer> termMap = new HashMap<String, Integer>();

    try {
        StandardAnalyzer analyst = new StandardAnalyzer();

        if (input == null)
            return termMap;

        TokenStream tokenStream = analyst.tokenStream("", new java.io.StringReader(input.trim()));

        // Count every token the analyzer produces.
        Token word = tokenStream.next();
        while (word != null) {
            String term = word.termText();
            Integer count = termMap.get(term);
            if (count == null)
                termMap.put(term, 1); // first occurrence
            else
                termMap.put(term, count + 1); // bump existing count
            word = tokenStream.next();
        }
    } catch (java.io.IOException ioe) {
        logger.error("IOException occured while parsing input string!", ioe);
    }
    return termMap;
}

From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java

License:Open Source License

/**
 * Builds an OGC Or filter for the given query string: an exact-match clause on the
 * full query plus one "like" clause per analyzed term. When tokenization fails, a
 * single "like" clause over the raw query string is used instead.
 */
private static Or createOrFilter(String queryString, String propertyName) {
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {

        propertyName = createPropertyName(propertyName);

        PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName);
        orList.add(propertyIsEqualTo);

        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());
        try {
            TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString);
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            try {
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(term, propertyName);
                    orList.add(propertyIsLike);
                }
                tokenStream.end();
            } finally {
                // Close even when tokenization fails mid-stream.
                tokenStream.close();
            }
        } catch (IOException e) {
            // Fallback: one "like" clause over the whole (un-analyzed) query string.
            PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(queryString, propertyName);
            orList.add(propertyIsLike);
        } finally {
            standardAnalyzer.close();
        }
    }

    Or or = new Or(new BinaryLogicOpType(orList));

    return or;
}

From source file:org.haplo.app.SearchResultExcerptHighlighter.java

License:Mozilla Public License

static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) {
    try {
        // Scorer selects the terms which need highlighting. Created from a 'query'
        // based on the extracted search terms when any are present; otherwise a
        // no-op scorer with plain fixed-size fragments.
        Scorer scorer;
        Fragmenter fragmenter;
        if (searchTerms != null && searchTerms.length() > 0) {
            QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer());
            Query query = queryParser.parse(searchTerms);
            scorer = new QueryScorer(query);
            fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength);
        } else {
            scorer = new NoHighlightingScorer();
            fragmenter = new SimpleFragmenter(maxExcerptLength);
        }

        // Parse the escaped text into tokens, which retain the positions in the text.
        // The analyzer is Closeable; release it once highlighting is done.
        try (StandardAnalyzer analyser = new StandardAnalyzer()) {
            TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText));

            // Finally, do the highlighting!
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer);
            highlighter.setTextFragmenter(fragmenter);
            return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS);
        }
    } catch (Exception e) {
        Logger.getLogger("org.haplo.app").info("Exception in SearchResultExcerptHighlighter: ", e);
        return null;
    }
}

From source file:org.jamwiki.search.LuceneSearchEngine.java

License:LGPL

/**
 * Builds a short HTML summary for one search result.
 *
 * The highlighter extracts up to three best-matching fragments from the plain-text
 * content; when nothing matches, the summary falls back to the HTML-escaped first
 * 200 characters of the content.
 *
 * @param document    the Lucene document holding the stored plain-text content
 * @param highlighter highlighter primed with the search query
 * @param analyzer    analyzer used to tokenize the stored content
 * @return the highlighted (or truncated) summary string
 * @throws Exception if tokenization or highlighting fails
 */
private String retrieveResultSummary(Document document, Highlighter highlighter, StandardAnalyzer analyzer)
        throws Exception {
    String content = document.get(ITYPE_CONTENT_PLAIN);
    TokenStream tokenStream = analyzer.tokenStream(ITYPE_CONTENT_PLAIN, new StringReader(content));
    String summary = highlighter.getBestFragments(tokenStream, content, 3, "...");
    if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) {
        // Fallback: escaped prefix of the raw content, with an ellipsis when truncated.
        int cut = Math.min(200, content.length());
        summary = StringEscapeUtils.escapeHtml(content.substring(0, cut));
        if (cut == 200) {
            summary += "...";
        }
    }
    return summary;
}