List of usage examples for org.apache.lucene.analysis.standard StandardAnalyzer tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a TokenStream for the given fieldName, tokenizing the contents of reader. From source file: at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() { // remove previous highlighting removeHighLights(weightingHighLights); if (weightHighlightBox.isSelected()) { if (inputDataObjects.getTemplateVector() == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").severe( "Template vector file needed for displaying weights. Load from the File->Data files menu"); weightHighlightBox.setSelected(false); return; }//from ww w . ja v a 2s .c om if (inputDataObjects.getInputData() == null) { Logger.getLogger("at.tuwien.ifs.somtoolbox").severe( "Input data file needed for displaying weights. Load from the File->Data files menu"); weightHighlightBox.setSelected(false); return; } SOMLibTemplateVector tv = inputDataObjects.getTemplateVector(); InputData data = inputDataObjects.getInputData(); InputDatum input = data.getInputDatum(currentInput); double maxValue = data.getMaxValue(); double minValue = data.getMinValue(); double span = maxValue - minValue; // init paints Palette p = paletteSelectionPanel.getSelectedPalette(); int paletteLength = p.getNumberOfColours(); weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength]; for (int i = 0; i < weightPaints.length; i++) { weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i)); } String text = textPane.getText(); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); try { while (stream.incrementToken()) { TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class); if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) { TermAttribute termAttribute = stream.getAttribute(TermAttribute.class); String term = termAttribute.term(); if (tv.containsLabel(term)) { int index = tv.getIndex(term); double value = input.getVector().getQuick(index); int colorIndex = (int) (paletteLength / 4d + relativeValue(minValue, span, value) * paletteLength / 2d); OffsetAttribute 
offsetAttribute = stream.getAttribute(OffsetAttribute.class); offsetAttribute.startOffset(); Object tag = highlighter.addHighlight(offsetAttribute.startOffset(), offsetAttribute.endOffset(), weightPaints[colorIndex]); weightingHighLights.add(tag); } } } } catch (IOException e) { e.printStackTrace(); } catch (BadLocationException e) { e.printStackTrace(); } } }
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/** * This function assumes that the TFIDF vector of the document containing text is already * given. We simply build a tfidf-vector of the text out of the docVector. * The purpose of doing this is to save the time computing the tf-idf value for words in * the same document./* w w w. j a v a2 s. co m*/ * * @param text * @param docVector * @return */ public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) { Map<String, Float> map = new HashMap<String, Float>(); //preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer). StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34); TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text)); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); try { tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); if (docVector.containsKey(term)) map.put(term, docVector.get(term)); } } catch (Exception e) { e.printStackTrace(); } analyzer.close(); return map; }
From source file:com.oneis.app.SearchResultExcerptHighlighter.java
License:Mozilla Public License
static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) { try {// w w w .j av a 2s . co m // Scorer selects the terms which need highlighting. Created from a 'query' based on the extracted search terms. Scorer scorer; Fragmenter fragmenter; if (searchTerms != null && searchTerms.length() > 0) { QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer()); Query query = queryParser.parse(searchTerms); scorer = new QueryScorer(query); fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength); } else { scorer = new NoHighlightingScorer(); fragmenter = new SimpleFragmenter(maxExcerptLength); } // Parse the escaped text into tokens, which retain the positions in the text StandardAnalyzer analyser = new StandardAnalyzer(); TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText)); // Finally, do the highlighting! Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer); highlighter.setTextFragmenter(fragmenter); return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS); } catch (Exception e) { Logger.getLogger("com.oneis.app").info("Exception in SearchResultExcerptHighlighter: ", e); return null; } }
From source file:com.recomdata.search.Finder.java
License:Open Source License
/**
 * Builds a Lucene filter from the given criteria, AND-ing together the
 * REPOSITORY, PATH, EXTENSION and NOTEXTENSION entries of the map.
 *
 * @param filterTerms map from criterion name to the list of values selected for it
 * @return a QueryWrapperFilter over the combined boolean query, or null when no
 *         criterion produced any clause
 */
private Filter buildFilter(LinkedHashMap<String, ArrayList<String>> filterTerms) {
    BooleanQuery andQuery = new BooleanQuery();
    // REPOSITORY: OR of exact term matches on the "repository" field
    // (addQueries presumably ORs the list into andQuery — TODO confirm against its definition)
    if (filterTerms.containsKey("REPOSITORY")) {
        ArrayList<String> list = filterTerms.get("REPOSITORY");
        ArrayList<Query> queries = new ArrayList<Query>();
        for (String value : list) {
            Term term = new Term("repository", value);
            TermQuery termQuery = new TermQuery(term);
            queries.add(termQuery);
        }
        addQueries(andQuery, queries);
    }
    // PATH: only the first value is used; it is tokenized and matched as a phrase
    if (filterTerms.containsKey("PATH")) {
        try {
            ArrayList<String> list = filterTerms.get("PATH");
            if (list.size() > 0) {
                StringReader reader = new StringReader(list.get(0));
                StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream tokenizer = analyzer.tokenStream("path", reader);
                PhraseQuery phraseQuery = new PhraseQuery();
                Token token = new Token();
                // pre-2.9 Lucene token API: next(Token) reuses the passed-in token
                // and returns null at end of stream
                for (token = tokenizer.next(token); token != null; token = tokenizer.next(token)) {
                    Term term = new Term("path", token.term());
                    phraseQuery.add(term);
                }
                andQuery.add(phraseQuery, BooleanClause.Occur.MUST);
            }
        } catch (IOException ex) {
            // do nothing -- deliberate best-effort: a failed PATH tokenization
            // silently drops only the PATH criterion
        }
    }
    // EXTENSION: OR of exact matches on the lower-cased "extension" field
    if (filterTerms.containsKey("EXTENSION")) {
        ArrayList<String> list = filterTerms.get("EXTENSION");
        ArrayList<Query> queries = new ArrayList<Query>();
        for (String value : list) {
            Term term = new Term("extension", value.toLowerCase());
            TermQuery termQuery = new TermQuery(term);
            queries.add(termQuery);
        }
        addQueries(andQuery, queries);
    }
    // NOTEXTENSION: each value becomes a MUST_NOT clause, excluding those extensions
    if (filterTerms.containsKey("NOTEXTENSION")) {
        ArrayList<String> list = filterTerms.get("NOTEXTENSION");
        for (String value : list) {
            Term term = new Term("extension", value.toLowerCase());
            TermQuery termQuery = new TermQuery(term);
            andQuery.add(termQuery, BooleanClause.Occur.MUST_NOT);
        }
    }
    if (andQuery.clauses().size() > 0) {
        return new QueryWrapperFilter(andQuery);
    }
    // no clauses were produced -> no filter
    return null;
}
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) { log.info("Command-line arguments: " + Arrays.toString(args)); DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("Input file").withShortName("i").create(); Option modelOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create()) .withDescription("Model to use when classifying data").withShortName("m").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create();//w ww. j a v a2 s. co m Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File inputFile = new File(cmdLine.getValue(inputOpt).toString()); if (!inputFile.isFile()) { throw new IllegalArgumentException(inputFile + " does not exist or is not a file"); } File modelDir = new File(cmdLine.getValue(modelOpt).toString()); if (!modelDir.isDirectory()) { throw new IllegalArgumentException(modelDir + " does not exist or is not a directory"); } BayesParameters p = new BayesParameters(); p.set("basePath", modelDir.getCanonicalPath()); Datastore ds = new InMemoryBayesDatastore(p); Algorithm a = new BayesAlgorithm(); ClassifierContext ctx = new ClassifierContext(a, ds); ctx.initialize(); //TODO: make the analyzer configurable StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8")); ArrayList<String> tokens = new 
ArrayList<String>(1000); while (ts.incrementToken()) { tokens.add(ts.getAttribute(CharTermAttribute.class).toString()); } String[] document = tokens.toArray(new String[tokens.size()]); ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5); for (ClassifierResult r : cr) { System.err.println(r.getLabel() + "\t" + r.getScore()); } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (IOException e) { log.error("IOException", e); } catch (InvalidDatastoreException e) { log.error("InvalidDataStoreException", e); } finally { } }
From source file:edu.ncsu.dre.impl.engine.LexicalSegregator.java
License:Open Source License
/** * This method implements the Lexical segeration functionality of a given String * which is available as an Object. Returns List of Objects. The lexical segregator * splits the given String artifact into words and sequences of words. Prepares them * into lists of strings as shown below. The scheduler will use these words and sequence * of words to gather more information on the artifact. * /* w w w .j av a2 s . com*/ * @param artifact * * @return Collection<Object> * * {@code * ReturnList * |__List of words * |__List of sub sequences (of configurable length) * } */ public Collection<Object> segregateArtifact(Object artifact) { logger.trace("segregateArtifact(Object artifact)"); String sArtifact = (String) artifact; List<Object> queryList = new ArrayList<Object>(); ArrayList<String> wordList = new ArrayList<String>(); try { StandardAnalyzer analyst = new StandardAnalyzer(); if (sArtifact == null) return queryList; TokenStream tokenStream = analyst.tokenStream("Input Stream", new java.io.StringReader(sArtifact.trim())); Token word = null; do { //Remove duplicates and insert into the list if (word != null && !wordList.contains(word.termText())) wordList.add(word.termText()); word = tokenStream.next(); } while (word != null); } catch (java.io.IOException ioe) { logger.error("IOException occured while parsing input stream!", ioe); } queryList.add(wordList); return queryList; }
From source file:edu.ncsu.dre.impl.engine.SRRSimCGAggregator.java
License:Open Source License
/** * This function retrieves the set of distinct terms from the input query and populates a HashMap * //from w w w .j av a 2 s.c o m * @param String input * @return java.util.Map<String,Integer> */ public Map<String, Integer> getDistinctTerms(String input) { HashMap<String, Integer> termMap = new HashMap<String, Integer>(); try { StandardAnalyzer analyst = new StandardAnalyzer(); if (input == null) return termMap; TokenStream tokenStream = analyst.tokenStream("", new java.io.StringReader(input.trim())); Token word = null; do { if (word != null) { if (termMap.containsKey(word.termText())) //Increment the hash table if it is already present termMap.put(word.termText(), termMap.get(word.termText()) + 1); else termMap.put(word.termText(), 1); //Else insert into the hash table } word = tokenStream.next(); } while (word != null); } catch (java.io.IOException ioe) { logger.error("IOException occured while parsing input string!", ioe); } return termMap; }
From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java
License:Open Source License
/**
 * Builds an OGC Or filter for the given query string: one exact-equality clause on
 * the raw query string plus one LIKE clause per token produced by a Dutch-stopword
 * StandardAnalyzer. If tokenization fails, a single LIKE on the raw string is used
 * as fallback.
 *
 * @param queryString  the user's search text
 * @param propertyName the property the clauses match against
 * @return the combined Or filter (empty when the query is blank or the default wildcard)
 */
private static Or createOrFilter(String queryString, String propertyName) {
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {
        propertyName = createPropertyName(propertyName);
        PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName);
        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());
        orList.add(propertyIsEqualTo);
        TokenStream tokenStream = null;
        try {
            tokenStream = standardAnalyzer.tokenStream("", queryString);
            // BUGFIX: dropped the OffsetAttribute and the unused startOffset/endOffset
            // locals — only the term text was ever used.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(term, propertyName);
                orList.add(propertyIsLike);
            }
            tokenStream.end();
        } catch (IOException e) {
            // fall back to a single LIKE on the whole query string
            PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(queryString, propertyName);
            orList.add(propertyIsLike);
        } finally {
            // BUGFIX: the stream was only closed on the success path before, and the
            // analyzer was never closed at all.
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException ignored) {
                    // best effort
                }
            }
            standardAnalyzer.close();
        }
    }
    Or or = new Or(new BinaryLogicOpType(orList));
    return or;
}
From source file:org.haplo.app.SearchResultExcerptHighlighter.java
License:Mozilla Public License
static public String[] bestHighlightedExcerpts(String escapedText, String searchTerms, int maxExcerptLength) { try {//from w ww . j a v a2 s. c o m // Scorer selects the terms which need highlighting. Created from a 'query' based on the extracted search terms. Scorer scorer; Fragmenter fragmenter; if (searchTerms != null && searchTerms.length() > 0) { QueryParser queryParser = new QueryParser("FIELD", new StandardAnalyzer()); Query query = queryParser.parse(searchTerms); scorer = new QueryScorer(query); fragmenter = new SimpleSpanFragmenter((QueryScorer) scorer, maxExcerptLength); } else { scorer = new NoHighlightingScorer(); fragmenter = new SimpleFragmenter(maxExcerptLength); } // Parse the escaped text into tokens, which retain the positions in the text StandardAnalyzer analyser = new StandardAnalyzer(); TokenStream tokenStream = analyser.tokenStream("FIELD", new StringReader(escapedText)); // Finally, do the highlighting! Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), scorer); highlighter.setTextFragmenter(fragmenter); return highlighter.getBestFragments(tokenStream, escapedText, NUMBER_OF_FRAGMENTS); } catch (Exception e) { Logger.getLogger("org.haplo.app").info("Exception in SearchResultExcerptHighlighter: ", e); return null; } }
From source file:org.jamwiki.search.LuceneSearchEngine.java
License:LGPL
/** * *///w w w . j a v a2 s. c o m private String retrieveResultSummary(Document document, Highlighter highlighter, StandardAnalyzer analyzer) throws Exception { String content = document.get(ITYPE_CONTENT_PLAIN); TokenStream tokenStream = analyzer.tokenStream(ITYPE_CONTENT_PLAIN, new StringReader(content)); String summary = highlighter.getBestFragments(tokenStream, content, 3, "..."); if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) { summary = StringEscapeUtils.escapeHtml(content.substring(0, Math.min(200, content.length()))); if (Math.min(200, content.length()) == 200) { summary += "..."; } } return summary; }