List of usage examples for org.apache.commons.lang3.text StrTokenizer setDelimiterString
public StrTokenizer setDelimiterString(final String delim)
From source file:com.mgmtp.jfunk.core.util.CsvDataProcessor.java
/** * Processes the specified CSV file. For every line but the header line (which is required), the * specified command is executed./*from w ww . j a va2 s. c o m*/ * * @param reader * the reader for loading the CSV data * @param delimiter * the column separator * @param quoteChar * the quote character ('\0' for no quoting) * @param command * the command (i. e. a Groovy closure if used in a Groovy script) to be executed for * every processed line */ public void processFile(final Reader reader, final String delimiter, final char quoteChar, final Runnable command) { try { List<String> inputLines = CharStreams.readLines(reader); StrTokenizer st = StrTokenizer.getCSVInstance(); st.setDelimiterString(delimiter); if (quoteChar != '\0') { st.setQuoteChar(quoteChar); } else { st.setQuoteMatcher(StrMatcher.noneMatcher()); } // extract header String headerLine = inputLines.remove(0); List<Column> columns = initColumns(st, headerLine); for (String line : inputLines) { st.reset(line); String[] colArray = st.getTokenArray(); int len = colArray.length; checkState(len == columns.size(), "Mismatch between number of header columns and number of line columns."); DataSource dataSource = dataSourceProvider.get(); Configuration config = configProvider.get(); for (int i = 0; i < len; ++i) { String value = StringUtils.trimToEmpty(colArray[i]); String dataSetKey = columns.get(i).dataSetKey; String key = columns.get(i).key; if (dataSetKey != null) { if ("<auto>".equals(value)) { dataSource.resetFixedValue(dataSetKey, key); } else { log.debug("Setting data set entry for " + this + " to value=" + value); dataSource.setFixedValue(dataSetKey, key, value); } } else { log.debug("Setting property for " + this + " to value=" + value); config.put(key, value); } } command.run(); } } catch (IOException ex) { throw new JFunkException("Error processing CSV data", ex); } }
From source file:org.gbif.file.StrTokenizerPerformance.java
@Test public void testCharVsStringPerformance() throws IOException { File source = FileUtils.getClasspathFile("irmng.tail"); // test CHAR// ww w . j a va 2 s.c om StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterChar('\t'); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); long time = test(tokenizer, source); System.out.println(time + " milliseconds for CHAR based tokenizer."); // test STRING tokenizer = new StrTokenizer(); tokenizer.setDelimiterString("\t"); tokenizer.setEmptyTokenAsNull(true); time = test(tokenizer, source); System.out.println(time + " milliseconds for STRING based tokenizer."); }
From source file:org.gbif.file.StrTokenizerTest.java
@Test public void testCsvQuoted() throws IOException { StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterString(","); tokenizer.setQuoteChar('"'); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens"); String[] columns = tokenizer.getTokenArray(); assertEquals("121", columns[0]); assertEquals("432423", columns[1]); assertEquals(" 9099053", columns[2]); assertEquals("Frieda karla L.,DC.", columns[3]); assertEquals("Ahrens", columns[4]); tokenizer.reset(" ,4321"); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertEquals("4321", columns[1]); tokenizer.reset(" ,,,,zzz "); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertNull(columns[1]);// w ww .j a v a2s .c om assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); tokenizer.reset(",,,,zzz "); columns = tokenizer.getTokenArray(); assertNull(columns[0]); assertNull(columns[1]); assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); }
From source file:org.gbif.file.StrTokenizerTest.java
@Test public void testCsvUnquoted() throws IOException { StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterString(","); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens"); String[] columns = tokenizer.getTokenArray(); assertEquals("121", columns[0]); assertEquals("432423", columns[1]); assertEquals(" 9099053", columns[2]); assertEquals("Frieda karla L.", columns[3]); assertEquals("DC.", columns[4]); assertEquals("Ahrens", columns[5]); tokenizer.reset(",,,,zzz "); columns = tokenizer.getTokenArray(); assertNull(columns[0]);//w w w. j av a 2 s . c o m assertNull(columns[1]); assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); }
From source file:org.gbif.file.StrTokenizerTest.java
/**
 * Tab-delimited parsing with a double-quote quote character:
 * quotes are stripped from quoted fields, whitespace-only fields survive,
 * and empty tokens come back as null.
 */
@Test
public void testTabQuoted() throws IOException {
    StrTokenizer tok = new StrTokenizer();
    tok.setDelimiterString("\t");
    tok.setQuoteChar('"');
    tok.setEmptyTokenAsNull(true);
    tok.setIgnoreEmptyTokens(false);

    // The quoted field keeps its internal comma and loses its quotes.
    tok.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
    String[] cols = tok.getTokenArray();
    assertEquals("121", cols[0]);
    assertEquals("432423", cols[1]);
    assertEquals(" 9099053", cols[2]);
    assertEquals("Frieda karla L.,DC.", cols[3]);
    assertEquals("Ahrens", cols[4]);

    // A single-space field is preserved, not treated as empty.
    tok.reset(" \t4321");
    cols = tok.getTokenArray();
    assertEquals(" ", cols[0]);
    assertEquals("4321", cols[1]);

    // Consecutive tabs yield null tokens (empty-as-null).
    tok.reset(" \t\t\t\tzzz ");
    cols = tok.getTokenArray();
    assertEquals(" ", cols[0]);
    for (int i = 1; i <= 3; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz ", cols[4]);

    // Same input but the first field is empty too.
    tok.reset("\t\t\t\tzzz ");
    cols = tok.getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz ", cols[4]);
}
From source file:org.gbif.file.StrTokenizerTest.java
@Test public void testTabUnquoted() throws IOException { StrTokenizer tokenizer = new StrTokenizer(); tokenizer.setDelimiterString("\t"); tokenizer.setEmptyTokenAsNull(true); tokenizer.setIgnoreEmptyTokens(false); tokenizer.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens"); String[] columns = tokenizer.getTokenArray(); assertEquals("121", columns[0]); assertEquals("432423", columns[1]); assertEquals(" 9099053", columns[2]); assertEquals("\"Frieda karla L.,DC.\"", columns[3]); assertEquals("Ahrens", columns[4]); tokenizer.reset(" \t4321"); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertEquals("4321", columns[1]); tokenizer.reset(" \t\t\t\tzzz "); columns = tokenizer.getTokenArray(); assertEquals(" ", columns[0]); assertNull(columns[1]);// w w w. ja va2 s . co m assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); tokenizer.reset("\t\t\t\tzzz "); columns = tokenizer.getTokenArray(); assertNull(columns[0]); assertNull(columns[1]); assertNull(columns[2]); assertNull(columns[3]); assertEquals("zzz ", columns[4]); }