Example usage for org.apache.commons.lang3.text StrTokenizer setDelimiterString

List of usage examples for org.apache.commons.lang3.text StrTokenizer setDelimiterString

Introduction

On this page you can find example usage for org.apache.commons.lang3.text StrTokenizer setDelimiterString.

Prototype

public StrTokenizer setDelimiterString(final String delim) 

Source Link

Document

Sets the field delimiter string.

Usage

From source file:com.mgmtp.jfunk.core.util.CsvDataProcessor.java

/**
 * Processes the specified CSV file. For every line but the header line (which is required), the
 * specified command is executed./*from   w ww  . j a  va2  s. c o  m*/
 * 
 * @param reader
 *            the reader for loading the CSV data
 * @param delimiter
 *            the column separator
 * @param quoteChar
 *            the quote character ('\0' for no quoting)
 * @param command
 *            the command (i. e. a Groovy closure if used in a Groovy script) to be executed for
 *            every processed line
 */
public void processFile(final Reader reader, final String delimiter, final char quoteChar,
        final Runnable command) {
    try {
        List<String> inputLines = CharStreams.readLines(reader);

        StrTokenizer st = StrTokenizer.getCSVInstance();
        st.setDelimiterString(delimiter);
        if (quoteChar != '\0') {
            st.setQuoteChar(quoteChar);
        } else {
            st.setQuoteMatcher(StrMatcher.noneMatcher());
        }

        // extract header
        String headerLine = inputLines.remove(0);
        List<Column> columns = initColumns(st, headerLine);
        for (String line : inputLines) {
            st.reset(line);
            String[] colArray = st.getTokenArray();
            int len = colArray.length;
            checkState(len == columns.size(),
                    "Mismatch between number of header columns and number of line columns.");

            DataSource dataSource = dataSourceProvider.get();
            Configuration config = configProvider.get();
            for (int i = 0; i < len; ++i) {
                String value = StringUtils.trimToEmpty(colArray[i]);

                String dataSetKey = columns.get(i).dataSetKey;
                String key = columns.get(i).key;
                if (dataSetKey != null) {
                    if ("<auto>".equals(value)) {
                        dataSource.resetFixedValue(dataSetKey, key);
                    } else {
                        log.debug("Setting data set entry for " + this + " to value=" + value);
                        dataSource.setFixedValue(dataSetKey, key, value);
                    }
                } else {
                    log.debug("Setting property for " + this + " to value=" + value);
                    config.put(key, value);
                }
            }

            command.run();
        }
    } catch (IOException ex) {
        throw new JFunkException("Error processing CSV data", ex);
    }
}

From source file:org.gbif.file.StrTokenizerPerformance.java

/**
 * Compares the throughput of a char-based delimiter against a string-based delimiter.
 * Both tokenizers must be configured identically so the timings are comparable.
 */
@Test
public void testCharVsStringPerformance() throws IOException {
    File source = FileUtils.getClasspathFile("irmng.tail");

    // test CHAR
    StrTokenizer tokenizer = new StrTokenizer();
    tokenizer.setDelimiterChar('\t');
    tokenizer.setEmptyTokenAsNull(true);
    tokenizer.setIgnoreEmptyTokens(false);
    long time = test(tokenizer, source);
    System.out.println(time + " milliseconds for CHAR based tokenizer.");

    // test STRING — configured the same as the CHAR tokenizer above; the original
    // omitted setIgnoreEmptyTokens(false), which made the comparison uneven.
    tokenizer = new StrTokenizer();
    tokenizer.setDelimiterString("\t");
    tokenizer.setEmptyTokenAsNull(true);
    tokenizer.setIgnoreEmptyTokens(false);
    time = test(tokenizer, source);
    System.out.println(time + " milliseconds for STRING based tokenizer.");
}

From source file:org.gbif.file.StrTokenizerTest.java

/** Verifies comma-delimited tokenizing with a double-quote quote character. */
@Test
public void testCsvQuoted() throws IOException {
    StrTokenizer tok = new StrTokenizer();
    tok.setDelimiterString(",");
    tok.setQuoteChar('"');
    tok.setIgnoreEmptyTokens(false);
    tok.setEmptyTokenAsNull(true);

    // a quoted field may contain the delimiter itself
    tok.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens");
    String[] cols = tok.getTokenArray();
    String[] expected = { "121", "432423", " 9099053", "Frieda karla L.,DC.", "Ahrens" };
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], cols[i]);
    }

    // whitespace-only tokens survive untouched
    tok.reset("   ,4321");
    cols = tok.getTokenArray();
    assertEquals("   ", cols[0]);
    assertEquals("4321", cols[1]);

    // empty tokens come back as null, not ""
    tok.reset(" ,,,,zzz  ");
    cols = tok.getTokenArray();
    assertEquals(" ", cols[0]);
    for (int i = 1; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);

    tok.reset(",,,,zzz  ");
    cols = tok.getTokenArray();
    for (int i = 0; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/** Verifies comma-delimited tokenizing without any quote character configured. */
@Test
public void testCsvUnquoted() throws IOException {
    StrTokenizer tok = new StrTokenizer();
    tok.setDelimiterString(",");
    tok.setIgnoreEmptyTokens(false);
    tok.setEmptyTokenAsNull(true);

    // without a quote char the comma inside the name splits the field in two
    tok.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens");
    String[] cols = tok.getTokenArray();
    String[] expected = { "121", "432423", " 9099053", "Frieda karla L.", "DC.", "Ahrens" };
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], cols[i]);
    }

    // empty tokens come back as null, not ""
    tok.reset(",,,,zzz  ");
    cols = tok.getTokenArray();
    for (int i = 0; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/** Verifies tab-delimited tokenizing with a double-quote quote character. */
@Test
public void testTabQuoted() throws IOException {
    StrTokenizer tok = new StrTokenizer();
    tok.setDelimiterString("\t");
    tok.setQuoteChar('"');
    tok.setIgnoreEmptyTokens(false);
    tok.setEmptyTokenAsNull(true);

    // a quoted field may contain commas without being split
    tok.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
    String[] cols = tok.getTokenArray();
    String[] expected = { "121", "432423", " 9099053", "Frieda karla L.,DC.", "Ahrens" };
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], cols[i]);
    }

    // whitespace-only tokens survive untouched
    tok.reset("   \t4321");
    cols = tok.getTokenArray();
    assertEquals("   ", cols[0]);
    assertEquals("4321", cols[1]);

    // empty tokens come back as null, not ""
    tok.reset(" \t\t\t\tzzz  ");
    cols = tok.getTokenArray();
    assertEquals(" ", cols[0]);
    for (int i = 1; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);

    tok.reset("\t\t\t\tzzz  ");
    cols = tok.getTokenArray();
    for (int i = 0; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/** Verifies tab-delimited tokenizing without any quote character configured. */
@Test
public void testTabUnquoted() throws IOException {
    StrTokenizer tok = new StrTokenizer();
    tok.setDelimiterString("\t");
    tok.setIgnoreEmptyTokens(false);
    tok.setEmptyTokenAsNull(true);

    // without a quote char the surrounding double quotes stay part of the token
    tok.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens");
    String[] cols = tok.getTokenArray();
    String[] expected = { "121", "432423", " 9099053", "\"Frieda karla L.,DC.\"", "Ahrens" };
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], cols[i]);
    }

    // whitespace-only tokens survive untouched
    tok.reset("   \t4321");
    cols = tok.getTokenArray();
    assertEquals("   ", cols[0]);
    assertEquals("4321", cols[1]);

    // empty tokens come back as null, not ""
    tok.reset(" \t\t\t\tzzz  ");
    cols = tok.getTokenArray();
    assertEquals(" ", cols[0]);
    for (int i = 1; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);

    tok.reset("\t\t\t\tzzz  ");
    cols = tok.getTokenArray();
    for (int i = 0; i < 4; i++) {
        assertNull(cols[i]);
    }
    assertEquals("zzz  ", cols[4]);
}