List of usage examples for org.apache.lucene.analysis TokenStream reflectAsString
public final String reflectAsString(final boolean prependAttClass)
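With prependAttClass=false the method reflects every attribute of the stream as key=value pairs joined by commas, while prependAttClass=true prefixes each key with its attribute interface name. That format is why several of the examples below pull the stemmed term out of the result with token.substring(5, token.indexOf(",")): the first five characters are "term=". The minimal sketch below is an illustration rather than an excerpt from the listed projects; it assumes a recent Lucene with a no-argument StandardAnalyzer constructor, and the field name "body" and the sample text are placeholders. It prints the reflected attribute state of each token; note that reset() must be called before the first incrementToken(), and end() after the last.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class ReflectAsStringDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // assumes a Lucene version with the no-arg constructor
        try (TokenStream ts = analyzer.tokenStream("body", "Hello Lucene token streams")) {
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                // prependAttClass=false -> "key=value,key=value,...", e.g. something like
                //   "term=hello,startOffset=0,endOffset=5,positionIncrement=1,type=<ALPHANUM>"
                // prependAttClass=true  -> each key prefixed with its attribute interface name
                System.out.println(ts.reflectAsString(false));
            }
            ts.end(); // finalize offsets/state after the last token
        }
        analyzer.close();
    }
}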
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 * @throws IOException
 */
public static Map<Integer, Map<Integer, Map<Integer, Double>>> calculateAspectDocumentMap(
        Map<Integer, Doc> docs, Map<String, Integer> vocabulary, List<String> vocabInverse,
        BidiMap pub2bugsIds, final int dim1size, final int dim2size, final int dim3size) {
    // double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size];
    // word -> aspect -> document -> occur number
    Map<Integer, Map<Integer, Map<Integer, Double>>> w2a2d2num = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters // DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        // apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                        // initialize w2a2d2count map
                        Double oldValue = Util.get3Map(w2a2d2num, vocabIndex, i, bugid);
                        Util.update3Map(w2a2d2num, vocabIndex, i, bugid, oldValue + 1);
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return w2a2d2num;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static double[][][] calculateAspectDocumentMatrix(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size, final int dim3size) {
    double[][][] apsectDocMatrix = new double[dim1size][dim2size][dim3size]; // word -> aspect -> document -> occur number
    try {
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_; // document id
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            List<String> textList = doc.getText();
            // aspect id
            for (int i = 0; i < textList.size(); i++) {
                // TODO: remove URL, @XXX and non-ASCII characters // DONE
                String text = textList.get(i);
                // remove url
                String t1 = text.replaceAll(
                        "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
                // remove non-ascii
                String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
                // stemmed text token stream
                TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
                // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
                while (tokenStream.incrementToken()) {
                    String token = tokenStream.reflectAsString(false);
                    int termBegin = 5;
                    int termEnd = token.indexOf(",");
                    final String wordstem = token.substring(termBegin, termEnd);
                    // w id
                    Integer vocabIndex = vocabulary.get(wordstem);
                    if (vocabIndex != null) {
                        // instance.add(vocabIndex, bugid, 1.0);
                        apsectDocMatrix[vocabIndex][i][bugid] += 1.0;
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return apsectDocMatrix;
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * @param docs
 * @param vocabulary
 * @param vocabInverse
 * @param pub2bugsIds
 * @param dim1size
 * @param dim2size
 * @return
 */
public static double[][] calculateWordDocumentMatrix(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size) {
    double[][] wordDocMatrix = new double[dim1size][dim2size];
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //         "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //         " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    wordDocMatrix[vocabIndex][bugid] += 1.0;
                }
            }
        }
        return wordDocMatrix;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
public static Map<Integer, Map<Integer, Double>> calculateWordDocumentMap(Map<Integer, Doc> docs,
        Map<String, Integer> vocabulary, List<String> vocabInverse, BidiMap pub2bugsIds,
        final int dim1size, final int dim2size) {
    Map<Integer, Map<Integer, Double>> wordDocMap = new HashMap<Integer, Map<Integer, Double>>();
    try {
        assert (dim1size == vocabInverse.size()) : "dimensions do not match in dim 1: dim1size=" + dim1size;
        // assert (dim2size == Collections.<Integer>max(pub2bugsIds.values()) + 1) :
        //         "max of pub2bugsIds does not match in dim 2: dim1size=" + dim2size +
        //         " Collections.<Integer>max(pub2bugsIds.values()=" + Collections.<Integer>max(pub2bugsIds.values());
        for (Object pubid_ : pub2bugsIds.keySet()) {
            int pubid = (Integer) pubid_;
            int bugid = getPubid2BugsId(pubid, pub2bugsIds);
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                Integer vocabIndex = vocabulary.get(wordstem);
                if (vocabIndex != null) {
                    // instance.add(vocabIndex, bugid, 1.0);
                    Double oldValue = Util.get2Map(wordDocMap, vocabIndex, bugid);
                    Util.update2Map(wordDocMap, vocabIndex, bugid, oldValue + 1.0);
                }
            }
        }
        return wordDocMap;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:nmsu.cs.DocParser.java
License:Open Source License
/**
 * Huiping noted 2012-02-29
 *
 * Extract vocabularies from all documents and build TF array, DF array.
 */
public void createVocabulary(Map<Integer, Doc> docs, int minWordsOccurence,
        // Map<String, Integer> vocabularyMap,
        List<String> vocabInverse) {
    vocabularyMap.clear();
    docInverse = hashDocs(new ArrayList<Doc>(docs.values()));
    int corpus_length = 0;
    try {
        // calculate origVocabulary
        // map from vocabulary word to its index position
        origVocabulary = new HashMap<String, Integer>();
        // words ordered by word index
        origVocabInverse = new ArrayList<String>();
        // list of documents containing each word, ordered by word index
        setOrigVocabInverse2Doc(new ArrayList<List<Integer>>());
        // each element holds a list of the frequency with which a word occurs in the document;
        // order is the same as above
        origVocabInverse2DocFrequency = new ArrayList<List<Integer>>();

        assert (docs.size() > 0);
        for (int pubid : docs.keySet()) {
            Doc doc = docs.get(pubid);
            String text = doc.getFullText();
            assert (text.length() > 1);
            // remove url
            String t1 = text.replaceAll(
                    "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "");
            // remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");
            TokenStream tokenStream = getStemmer().tokenStream("", new StringReader(t2));
            // for (Token tok = tokenStream.next(); tok != null; tok = tokenStream.next()) {
            //     final String wordstem = tok.termText();
            while (tokenStream.incrementToken()) {
                corpus_length++;
                String token = tokenStream.reflectAsString(false);
                int termBegin = 5;
                int termEnd = token.indexOf(",");
                final String wordstem = token.substring(termBegin, termEnd);
                // get the vocab index
                Integer vocabIndex = origVocabulary.get(wordstem);
                if (vocabIndex == null) {
                    vocabIndex = origVocabulary.size();
                    origVocabulary.put(wordstem, vocabIndex); // hash
                    origVocabInverse.add(wordstem); // add to list
                    getOrigVocabInverse2Doc().add(new ArrayList<Integer>(Collections.singleton(pubid))); // add to document hash
                    origVocabInverse2DocFrequency.add(new ArrayList<Integer>(Collections.singleton(1))); // add to document hash
                    assert (vocabIndex == origVocabInverse.size() - 1) : "vocabIndex = " + vocabIndex
                            + " origVocabInverse.size()-1=" + (origVocabInverse.size() - 1);
                    assert (vocabIndex == getOrigVocabInverse2Doc().size() - 1) : "vocabIndex =" + vocabIndex
                            + " origVocabInverse2Doc.size()-1=" + (getOrigVocabInverse2Doc().size() - 1);
                    assert (getOrigVocabInverse2Doc().get(vocabIndex).size() == 1)
                            : "origVocabInverse2Doc.get(vocabIndex).size()="
                                    + getOrigVocabInverse2Doc().get(vocabIndex).size();
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).size() == 1)
                            : "origVocabInverse2DocFrequency.get(vocabIndex).size()="
                                    + origVocabInverse2DocFrequency.get(vocabIndex).size();
                } else {
                    assert (origVocabInverse.get(origVocabulary.get(wordstem)).equals(wordstem))
                            : "Inverse vocabulary broken. wordstem=" + wordstem
                                    + " vocabulary.get(wordstem)=" + origVocabulary.get(wordstem)
                                    + " origVocabInverse.get(" + origVocabulary.get(wordstem) + ")="
                                    + origVocabInverse.get(origVocabulary.get(wordstem)) + ")";
                    // add 1 to the frequency list of (vocabIndex, doc)
                    List<Integer> doclist = getOrigVocabInverse2Doc().get(vocabIndex);
                    List<Integer> freqlist = origVocabInverse2DocFrequency.get(vocabIndex);
                    int docindex1 = doclist.indexOf(pubid);
                    if (docindex1 < 0) {
                        docindex1 = doclist.size();
                        doclist.add(pubid);
                    }
                    while (docindex1 >= freqlist.size()) {
                        freqlist.add(0);
                    }
                    int docindex = docindex1;
                    final int oldFreq = freqlist.get(docindex);
                    freqlist.set(docindex, oldFreq + 1);
                    assert (getOrigVocabInverse2Doc().size() == origVocabInverse2DocFrequency.size());
                    assert (origVocabInverse2DocFrequency.get(vocabIndex).get(docindex) > 0);
                }
            }
        }

        // calculate the (real) vocabulary used by the inference algorithm:
        // throw out words that occur only within one document (and thus achieve no coupling)
        assert (getOrigVocabInverse2Doc().size() > 1);
        assert (origVocabInverse.size() > 1);
        // vocabulary = new HashMap<String, Integer>();
        // vocabInverse = new ArrayList<String>();
        for (int w = 0; w < getOrigVocabInverse2Doc().size(); w++) {
            if (getOrigVocabInverse2Doc().get(w).size() >= minWordsOccurence) {
                int vocaIndex = vocabInverse.size();
                vocabInverse.add(origVocabInverse.get(w));
                vocabularyMap.put(origVocabInverse.get(w), vocaIndex);
            }
        }
        // System.out.println(Debugger.getCallerPosition()+"vocabInverse (" + vocabInverse.size() + ") " + vocabInverse);
        // System.out.println(Debugger.getCallerPosition()+"origVocabInverse (" + origVocabInverse.size() + ")" + origVocabInverse);
        // System.out.println(Debugger.getCallerPosition()+Arrays.toString(vocabInverse.toArray()));
        assert (vocabInverse.size() > 0);
    } catch (IOException e) {
        throw new RuntimeException(e); // todo handle
    }
    Constant.tokenNum = vocabularyMap.size();
    System.out.println(Debugger.getCallerPosition() + " corpus length is " + corpus_length);
}
From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License:Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java
License:Open Source License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = ts.reflectAsString(false);
            int cnt = termFreq.getOrDefault(term, 0);
            termFreq.put(term, cnt + 1);
        }
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}