Usage examples for org.apache.commons.lang.text.StrTokenizer.getTSVInstance()
public static StrTokenizer getTSVInstance()
From source file: it.drwolf.ridire.session.async.Mapper.java
@SuppressWarnings("unchecked") public static Integer countWordsFromPoSTagResource(String posTagResourceFileName) throws IOException { List<String> lines = FileUtils.readLines(new File(posTagResourceFileName)); Integer count = 0;//from w ww .ja v a 2 s .co m StrTokenizer tokenizer = StrTokenizer.getTSVInstance(); for (String l : lines) { tokenizer.reset(l); String[] tokens = tokenizer.getTokenArray(); if (tokens.length == 3) { if (Mapper.isValidPos(tokens[1].trim())) { ++count; } } } return count; }
From source file: it.drwolf.ridire.session.async.WordCounter.java
public Integer countWordsFromPoSTagResource(File posTagResourceFile) throws IOException { List<String> lines = FileUtils.readLines(posTagResourceFile); Integer count = 0;// ww w . jav a2s . c om StrTokenizer tokenizer = StrTokenizer.getTSVInstance(); for (String l : lines) { tokenizer.reset(l); String[] tokens = tokenizer.getTokenArray(); if (tokens.length == 3) { if (this.isValidPos(tokens[1].trim())) { ++count; } } } return count; }
From source file: it.drwolf.ridire.session.JobManager.java
public void retrievePoSText(CrawledResource cr) { File resourceDir = new File( FilenameUtils.getFullPath(cr.getArcFile().replaceAll("__\\d+", "")) + JobManager.RESOURCESDIR); File posTextFile = new File(resourceDir, cr.getDigest() + ".txt.pos"); List<PoSLine> posLines = new ArrayList<PoSLine>(); try {// w w w . j a v a 2 s. c o m List<String> lines = FileUtils.readLines(posTextFile); StrTokenizer tokenizer = StrTokenizer.getTSVInstance(); for (String l : lines) { tokenizer.reset(l); String[] tokens = tokenizer.getTokenArray(); if (tokens.length == 3) { PoSLine poSLine = new PoSLine(); poSLine.setForm(tokens[0].trim()); poSLine.setPosTag(tokens[1].trim()); poSLine.setLemma(tokens[2].trim()); posLines.add(poSLine); } } } catch (IOException e) { } this.setPosText(posLines); }