List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setOutputWordCounts
public void setOutputWordCounts(boolean outputWordCounts)
From source file: com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java
License: Apache License
/**
 * Counts word occurrences in the HTML content of the given document.
 *
 * <p>The content is wrapped in a single-attribute Weka {@link Instances} set and
 * run through a {@link StringToWordVector} filter configured for unigram tokens,
 * lower-casing and raw word counts. Tokens shorter than 5 characters are treated
 * as stopwords and dropped.
 *
 * @param documentContentData document whose HTML content is analysed
 * @param maxResult           maximum number of words to keep (passed to
 *                            {@code setWordsToKeep})
 * @return map from word to occurrence count; empty if filtering fails
 */
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
    final String html = documentContentData.getContent();

    // Single string attribute holding the raw HTML content.
    final Attribute input = new Attribute("html", (ArrayList<String>) null);
    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);

    final Instances htmlInst = new Instances("html", inputVec, 1);
    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    // Treat short tokens (< 5 characters) as stopwords.
    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    // Unigrams only (min and max n-gram size are both 1).
    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();
    try {
        // Options are configured above; only then is the input format fixed.
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        // Each attribute of the filtered instance is a word; its value is the count.
        final Instance last = dataFiltered.lastInstance();
        final int numAttributes = last.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        // Filter.useFilter declares a broad Exception; log and return best effort.
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }
    return result;
}
From source file: nl.uva.expose.classification.WekaClassification.java
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setAttributeIndices("first-last"); filter.setIDFTransform(true);// w w w. j a va 2 s. c om filter.setLowerCaseTokens(true); filter.setMinTermFreq(2); filter.setLowerCaseTokens(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); // filter.setTokenizer(); // filter.setWordsToKeep(); filter.setInputFormat(dRaw); dFiltered = Filter.useFilter(dRaw, filter); }
From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerBoolean.java
License: Open Source License
public void indexingToTokenizer(String inPath, String outPath) throws Exception { WordTokenizer wordTokenizer = new WordTokenizer(); wordTokenizer.setDelimiters("\r \t.,;:'\"()?!"); Instances inputInstances = WekaUtils.loadARFF(inPath); StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(inputInstances); filter.setDoNotOperateOnPerClassBasis(false); filter.setInvertSelection(false);//from w ww.ja va2 s. c o m filter.setLowerCaseTokens(true); filter.setOutputWordCounts(false); filter.setTokenizer(wordTokenizer); filter.setUseStoplist(true); filter.setWordsToKeep(wordsTokeep); Instances outputInstances = Filter.useFilter(inputInstances, filter); OutputStreamUtils.writeSimple(outputInstances.toString(), outPath); }
From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java
License: Open Source License
/**
 * Loads an ARFF file, converts its string attributes into a TF-IDF-weighted
 * word-vector representation (top 200 words, minimum term frequency 3), and
 * writes the result.
 *
 * <p>Bug fix: {@code setInputFormat} was called before the filter options were
 * configured. Weka's {@code StringToWordVector} determines its output format at
 * {@code setInputFormat} time, so options set afterwards were ignored. The call
 * is now made after all options are set.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path where the tokenized instances are written
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setIDFTransform(true);
    filter.setTFTransform(true);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // Must come AFTER option configuration (see Javadoc above).
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);
    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}
From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerVector.java
License: Open Source License
public void indexingToTokenizer(String inPath, String outPath) throws Exception { WordTokenizer wordTokenizer = new WordTokenizer(); wordTokenizer.setDelimiters("\r \t.,;:'\"()?!"); Instances inputInstances = WekaUtils.loadARFF(inPath); StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(inputInstances); filter.setDoNotOperateOnPerClassBasis(false); filter.setInvertSelection(false);// w ww. ja v a 2 s .c o m filter.setLowerCaseTokens(true); filter.setMinTermFreq(3); filter.setOutputWordCounts(true); filter.setTokenizer(wordTokenizer); filter.setUseStoplist(true); filter.setWordsToKeep(200); Instances outputInstances = Filter.useFilter(inputInstances, filter); OutputStreamUtils.writeSimple(outputInstances.toString(), outPath); }