List of usage examples for the org.apache.commons.lang3.text.StrTokenizer constructor StrTokenizer()
public StrTokenizer()
From source file: org.gbif.file.CSVReader.java
public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows) throws IOException { Cache<Integer, String> cache = CacheBuilder.newBuilder().maximumSize(1000).build(); this.emptyLines = cache.asMap(); this.rows = 0; this.readRows = 0; this.delimiter = delimiter; this.encoding = encoding; this.quoteChar = quotes; this.headerRows = headerRows == null || headerRows < 0 ? 0 : headerRows; tokenizer = new StrTokenizer(); tokenizer.setDelimiterString(delimiter); if (quotes != null) { tokenizer.setQuoteChar(quotes);/*from w ww.j av a 2 s. c om*/ } tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset(); InputStreamReader reader = new InputStreamReader(stream, encoding); br = new BufferedReader(reader); row = br.readLine(); // parse header row if (row == null) { header = null; } else { tokenizer.reset(row); header = tokenizer.getTokenArray(); } // skip initial header rows? while (headerRows > 0) { headerRows--; row = br.readLine(); } }
From source file: org.gbif.file.StrTokenizerPerformance.java
@Test public void testCharVsStringPerformance() throws IOException { File source = FileUtils.getClasspathFile("irmng.tail"); // test CHAR/*from w ww. jav a 2 s .c o m*/ StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterChar('\t'); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); long time = test(tokenizer, source); System.out.println(time + " milliseconds for CHAR based tokenizer."); // test STRING tokenizer = new StrTokenizer(); tokenizer.setDelimiterString("\t"); tokenizer.setEmptyTokenAsNull(true); time = test(tokenizer, source); System.out.println(time + " milliseconds for STRING based tokenizer."); }
From source file: org.gbif.file.StrTokenizerTest.java
@Test public void testCsvQuoted() throws IOException { StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterString(","); tokenizer.setQuoteChar('"'); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens"); String[] columns = tokenizer.getTokenArray(); assertEquals("121", columns[0]); assertEquals("432423", columns[1]); assertEquals(" 9099053", columns[2]); assertEquals("Frieda karla L.,DC.", columns[3]); assertEquals("Ahrens", columns[4]); tokenizer.reset(" ,4321"); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertEquals("4321", columns[1]); tokenizer.reset(" ,,,,zzz "); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertNull(columns[1]);//from ww w . j a v a 2 s . c o m assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); tokenizer.reset(",,,,zzz "); columns = tokenizer.getTokenArray(); assertNull(columns[0]); assertNull(columns[1]); assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); }
From source file: org.gbif.file.StrTokenizerTest.java
/**
 * Comma-delimited input with no quote character configured: every comma is expected to split,
 * and empty fields are expected to come back as {@code null}.
 */
@Test
public void testCsvUnquoted() throws IOException {
    StrTokenizer splitter = new StrTokenizer();
    splitter.setDelimiterString(",");
    splitter.setEmptyTokenAsNull(true);
    splitter.setIgnoreEmptyTokens(false);

    // The comma inside "L.,DC." is a plain delimiter here, giving six columns.
    splitter.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens");
    String[] cells = splitter.getTokenArray();
    String[] wanted = {"121", "432423", " 9099053", "Frieda karla L.", "DC.", "Ahrens"};
    int idx = 0;
    for (String want : wanted) {
        assertEquals(want, cells[idx]);
        idx++;
    }

    // Four leading empty fields, then a value with a trailing blank.
    splitter.reset(",,,,zzz ");
    cells = splitter.getTokenArray();
    for (idx = 0; idx < 4; idx++) {
        assertNull(cells[idx]);
    }
    assertEquals("zzz ", cells[4]);
}
From source file: org.gbif.file.StrTokenizerTest.java
/**
 * Pipe-delimited input with {@code "} as the quote character: a quoted field is expected to
 * keep its embedded pipe, and empty fields are expected to come back as {@code null}.
 */
@Test
public void testPipes() throws IOException {
    StrTokenizer piped = new StrTokenizer();
    piped.setDelimiterChar('|');
    piped.setQuoteChar('"');
    piped.setEmptyTokenAsNull(true);
    piped.setIgnoreEmptyTokens(false);

    String[] parts;

    // Full row with one quoted field containing the delimiter.
    piped.reset("121|432423| 9099053|\"Frieda karla L.|DC.\"|Ahrens");
    parts = piped.getTokenArray();
    String[] expectedRow = {"121", "432423", " 9099053", "Frieda karla L.|DC.", "Ahrens"};
    for (int col = 0; col < expectedRow.length; col++) {
        assertEquals(expectedRow[col], parts[col]);
    }

    // Blank first field is preserved.
    piped.reset(" |4321");
    parts = piped.getTokenArray();
    assertEquals(" ", parts[0]);
    assertEquals("4321", parts[1]);

    // Blank first field followed by three empty ones.
    piped.reset(" ||||zzz ");
    parts = piped.getTokenArray();
    assertEquals(" ", parts[0]);
    for (int col = 1; col < 4; col++) {
        assertNull(parts[col]);
    }
    assertEquals("zzz ", parts[4]);

    // Four leading empty fields.
    piped.reset("||||zzz ");
    parts = piped.getTokenArray();
    for (int col = 0; col < 4; col++) {
        assertNull(parts[col]);
    }
    assertEquals("zzz ", parts[4]);
}
From source file: org.gbif.file.StrTokenizerTest.java
/**
 * Tab-delimited input with {@code "} as the quote character: the quotes around a field are
 * expected to be stripped, and empty fields are expected to come back as {@code null}.
 */
@Test
public void testTabQuoted() throws IOException {
    StrTokenizer tsv = new StrTokenizer();
    tsv.setDelimiterString("\t");
    tsv.setQuoteChar('"');
    tsv.setEmptyTokenAsNull(true);
    tsv.setIgnoreEmptyTokens(false);

    // Full row; the quoted field loses its surrounding quotes.
    tsv.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
    String[] values = tsv.getTokenArray();
    String[] expectedValues = {"121", "432423", " 9099053", "Frieda karla L.,DC.", "Ahrens"};
    for (int pos = 0; pos < expectedValues.length; pos++) {
        assertEquals(expectedValues[pos], values[pos]);
    }

    // Blank first field is preserved.
    tsv.reset(" \t4321");
    values = tsv.getTokenArray();
    assertEquals(" ", values[0]);
    assertEquals("4321", values[1]);

    // Blank first field followed by three empty ones.
    tsv.reset(" \t\t\t\tzzz ");
    values = tsv.getTokenArray();
    assertEquals(" ", values[0]);
    for (int pos = 1; pos <= 3; pos++) {
        assertNull(values[pos]);
    }
    assertEquals("zzz ", values[4]);

    // Four leading empty fields.
    tsv.reset("\t\t\t\tzzz ");
    values = tsv.getTokenArray();
    for (int pos = 0; pos <= 3; pos++) {
        assertNull(values[pos]);
    }
    assertEquals("zzz ", values[4]);
}
From source file: org.gbif.file.StrTokenizerTest.java
@Test public void testTabUnquoted() throws IOException { StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterString("\t"); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens"); String[] columns = tokenizer.getTokenArray(); assertEquals("121", columns[0]); assertEquals("432423", columns[1]); assertEquals(" 9099053", columns[2]); assertEquals("\"Frieda karla L.,DC.\"", columns[3]); assertEquals("Ahrens", columns[4]); tokenizer.reset(" \t4321"); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertEquals("4321", columns[1]); tokenizer.reset(" \t\t\t\tzzz "); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertNull(columns[1]);// w ww . j a v a2 s .c om assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); tokenizer.reset("\t\t\t\tzzz "); columns = tokenizer.getTokenArray(); assertNull(columns[0]); assertNull(columns[1]); assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); }