List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
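The examples below all follow the same consumption contract: obtain a TokenStream, call reset(), loop while incrementToken() returns true while reading attributes such as CharTermAttribute, then call end() and close(). As a minimal sketch of that contract (the StandardAnalyzer, field name, and sample text here are illustrative assumptions, not taken from the examples that follow):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical analyzer, field name, and text, chosen only for illustration.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello TokenStream incrementToken example");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream attribute state
            ts.close();                   // releases resources held by the stream
        }
    }
}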
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 * @throws IOException
 */
public static Map<Integer, Map<Integer, Map<Integer, Double>>> calculateAspectDocumentMap(
        Map<Integer, Doc> docs, Map<String, Integer> vocabulary, List<String> vocabInverse,
        BidiMap pub2bugsIds, final int dim1size, final int dim2size, final int dim3size) {
    // double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    // word -> aspect -> document -> occur number
    Map<Integer, Map<Integer, Map<Integer, Double>>> w2a2d2num = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters -- DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        // apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                        // initialize w2a2d2count map
                        Double oldValue = Util.get3Map(w2a2d2num, vocabIndex, i, bugid);
                        Util.update3Map(w2a2d2num, vocabIndex, i, bugid, oldValue + 1);
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return w2a2d2num;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static double[][][] calculateAspectDocumentMatrix(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size, final int dim3size) {
    // word -> aspect -> document -> occur number
    double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters -- DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return apsectDocMatrix;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param vocabulary
 * @param vocabInverse
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 */
public static double[][] calculateWordDocumentMatrix(Map<Integer, Doc> docs, Map<String, Integer> vocabulary,
        List<String> vocabInverse, BidiMap pub2bugsIds, final int dim1size, final int dim2size) {
    double[][] wordDocMatrix = new double[dim1size][dim2size];
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //     "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //     " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    wordDocMatrix[vocabIndex][bugid] += 1.0;
                }
            }
        }
        return wordDocMatrix;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static Map<Integer, Map<Integer, Double>> calculateWordDocumentMap(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size) {
    Map<Integer, Map<Integer, Double>> wordDocMap = new HashMap<Integer, Map<Integer, Double>>();
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //     "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //     " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    Double oldValue = Util.get2Map(wordDocMap, vocabIndex, bugid);
                    Util.update2Map(wordDocMap, vocabIndex, bugid, oldValue + 1.0);
                }
            }
        }
        return wordDocMap;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * Huiping noted 2012-02-29
 *
 * Extract vocabularies from all documents and build TF array, DF array.
 */
public void createVocabulary(Map<Integer, Doc> docs, int minWordsOccurence,
        //Map<String, Integer> vocabularyMap,
        List<String> vocabInverse) {
    vocabularyMap.clear();
    docInverse = hashDocs(new ArrayList<Doc>(docs.values()));
    int corpus_length = 0;
    try {
        // calculate origVocabulary
        // map from vocabulary to its index position
        origVocabulary = new HashMap<String, Integer>();
        // word ordered by word index
        origVocabInverse = new ArrayList<String>();
        // list of documents containing word, ordered by word index
        setOrigVocabInverse2Doc(new ArrayList<List<Integer>>());
        // each element holds a list of the frequency with which a word occurs in the document.
        // order is the same as above
        origVocabInverse2DocFrequency = new ArrayList<List<Integer>>();
        assert (docs.size() > 0);
        for (int pubid : docs.keySet()) {
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                corpus_length++;
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                // get the vocab index
                Integer vocabIndex = origVocabulary.get(wordstem);
                if (vocabIndex == null) {
                    vocabIndex = origVocabulary.size();
                    origVocabulary.put(wordstem, vocabIndex); // hash
                    origVocabInverse.add(wordstem); // add to list
                    getOrigVocabInverse2Doc().add(new ArrayList<Integer>(Collections.singleton(pubid))); // add to document hash
                    origVocabInverse2DocFrequency.add(new ArrayList<Integer>(Collections.singleton(1))); // add to document hash
                    assert (vocabIndex == origVocabInverse.size() - 1) : "vocabIndex = " + vocabIndex
                            + " origVocabInverse.size()-1=" + (origVocabInverse.size() - 1);
                    assert (vocabIndex == getOrigVocabInverse2Doc().size() - 1) : "vocabIndex =" + vocabIndex
                            + " origVocabInverse2Doc.size()-1=" + (getOrigVocabInverse2Doc().size() - 1);
                    assert (getOrigVocabInverse2Doc().get(vocabIndex).size() == 1)
                            : "origVocabInverse2Doc.get(vocabIndex).size()="
                                    + getOrigVocabInverse2Doc().get(vocabIndex).size();
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).size() == 1)
                            : "origVocabInverse2DocFrequency.get(vocabIndex).size()="
                                    + origVocabInverse2DocFrequency.get(vocabIndex).size();
                } else {
                    assert (origVocabInverse.get(origVocabulary.get(wordstem)).equals(wordstem))
                            : "Inverse vocabulary broken. wordstem=" + wordstem
                                    + " vocabulary.get(wordstem)=" + origVocabulary.get(wordstem)
                                    + " origVocabInverse.get(" + origVocabulary.get(wordstem) + ")="
                                    + origVocabInverse.get(origVocabulary.get(wordstem)) + ")";
                    // add 1 to the frequency list of (vocabIndex, doc)
                    List<Integer> doclist = getOrigVocabInverse2Doc().get(vocabIndex);
                    List<Integer> freqlist = origVocabInverse2DocFrequency.get(vocabIndex);
                    int docindex1 = doclist.indexOf(pubid);
                    if (docindex1 < 0) {
                        docindex1 = doclist.size();
                        doclist.add(pubid);
                    }
                    while (docindex1 >= freqlist.size()) {
                        freqlist.add(0);
                    }
                    int docindex = docindex1;
                    final int oldFreq = freqlist.get(docindex);
                    freqlist.set(docindex, oldFreq + 1);
                    assert (getOrigVocabInverse2Doc().size() == origVocabInverse2DocFrequency.size());
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).get(docindex) > 0);
                }
            }
        }
        // calculate the (real) vocabulary used by the inference algorithm:
        // throw out words that occur only within one document (and thus achieve no coupling)
        assert (getOrigVocabInverse2Doc().size() > 1);
        assert (origVocabInverse.size() > 1);
        // vocabulary = new HashMap<String, Integer>();
        // vocabInverse = new ArrayList<String>();
        for (int w = 0; w < getOrigVocabInverse2Doc().size(); w++) {
            if (getOrigVocabInverse2Doc().get(w).size() >= minWordsOccurence) {
                int vocaIndex = vocabInverse.size();
                vocabInverse.add(origVocabInverse.get(w));
                vocabularyMap.put(origVocabInverse.get(w), vocaIndex);
            }
        }
        // System.out.println(Debugger.getCallerPosition()+"vocabInverse (" + vocabInverse.size() + ") " + vocabInverse);
        // System.out.println(Debugger.getCallerPosition()+"origVocabInverse (" + origVocabInverse.size() + ")" + origVocabInverse);
        // System.out.println(Debugger.getCallerPosition()+Arrays.toString(vocabInverse.toArray()));
        assert (vocabInverse.size() > 0);
    } catch (IOException e) {
        throw new RuntimeException(e); // TODO: handle
    }
    Constant.tokenNum = vocabularyMap.size();
    System.out.println(Debugger.getCallerPosition() + " corpus length is " + corpus_length);
}
From source file:org.aksw.palmetto.corpus.lucene.SimpleAnalyzerTest.java
License:Open Source License
public void test(boolean lowercase) throws Exception {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(lowercase);
    TokenStream stream = analyzer.tokenStream("test", text);
    CharTermAttribute token;
    int count = 0;
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertTrue(count < expectedTokens.length);
        token = stream.getAttribute(CharTermAttribute.class);
        if (lowercase) {
            Assert.assertEquals(expectedTokens[count].toLowerCase(), token.toString());
        } else {
            Assert.assertEquals(expectedTokens[count], token.toString());
        }
        ++count;
    }
    Assert.assertEquals(expectedTokens.length, count);
    analyzer.close();
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License:Open Source License
/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens List of tokens expected from the stream.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License:Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.alfresco.solr.query.Solr4QueryParser.java
License:Open Source License
private ArrayList<String> getTokens(IndexableField indexableField) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = indexableField.tokenStream(schema.getIndexAnalyzer(), null);
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}