Example usage of the org.apache.commons.lang3.text.StrTokenizer constructor StrTokenizer()

List of usage examples for the org.apache.commons.lang3.text.StrTokenizer constructor StrTokenizer()

Introduction

On this page you can find example usages of the no-argument constructor StrTokenizer() of org.apache.commons.lang3.text.StrTokenizer.

Prototype

public StrTokenizer() 

Source Link

Document

Constructs a tokenizer splitting on space, tab, newline and formfeed as per StringTokenizer, but with no text to tokenize.

Usage

From source file:org.gbif.file.CSVReader.java

/**
 * Creates a reader over the given stream, configures the line tokenizer and
 * consumes the first line as the header.
 *
 * <p>The first line is always read and tokenized into {@code header}
 * ({@code header} is {@code null} when the stream has no lines). After that,
 * {@code headerRows} further lines are read so that {@code row} holds the line
 * following the skipped rows.
 *
 * @param stream     the input to read; wrapped in a buffered reader
 * @param encoding   charset name used to decode {@code stream}
 * @param delimiter  delimiter string handed to the tokenizer
 * @param quotes     quote character, or {@code null} to leave the tokenizer's quote setting untouched
 * @param headerRows number of rows to skip after the first line; {@code null} or
 *                   negative values are treated as 0
 * @throws IOException if decoding setup or reading the initial lines fails
 */
public CSVReader(InputStream stream, String encoding, String delimiter, Character quotes, Integer headerRows)
        throws IOException {
    // size-bounded cache (at most 1000 entries) exposed as a map
    Cache<Integer, String> cache = CacheBuilder.newBuilder().maximumSize(1000).build();
    this.emptyLines = cache.asMap();
    this.rows = 0;
    this.readRows = 0;
    this.delimiter = delimiter;
    this.encoding = encoding;
    this.quoteChar = quotes;
    // Normalise once and reuse below: the raw parameter may be null, and unboxing it
    // in the skip loop would throw a NullPointerException.
    int rowsToSkip = headerRows == null || headerRows < 0 ? 0 : headerRows;
    this.headerRows = rowsToSkip;
    tokenizer = new StrTokenizer();
    tokenizer.setDelimiterString(delimiter);
    if (quotes != null) {
        tokenizer.setQuoteChar(quotes);
    }
    // keep empty columns so that token positions line up with column positions
    tokenizer.setIgnoreEmptyTokens(false);
    tokenizer.reset();
    InputStreamReader reader = new InputStreamReader(stream, encoding);
    br = new BufferedReader(reader);
    row = br.readLine();
    // parse header row
    if (row == null) {
        header = null;
    } else {
        tokenizer.reset(row);
        header = tokenizer.getTokenArray();
    }
    // skip initial header rows
    for (int skipped = 0; skipped < rowsToSkip; skipped++) {
        row = br.readLine();
    }
}

From source file:org.gbif.file.StrTokenizerPerformance.java

/**
 * Runs the same classpath file through a tokenizer configured with a char
 * delimiter and one configured with a string delimiter, printing the value
 * returned by {@code test(...)} for each (presumably elapsed milliseconds,
 * per the printed message).
 *
 * <p>Both tokenizers are configured identically apart from the delimiter type,
 * so the two measurements cover the same tokenizing work.
 */
@Test
public void testCharVsStringPerformance() throws IOException {
    File source = FileUtils.getClasspathFile("irmng.tail");

    // test CHAR
    StrTokenizer tokenizer = new StrTokenizer();
    tokenizer.setDelimiterChar('\t');
    tokenizer.setEmptyTokenAsNull(true);
    tokenizer.setIgnoreEmptyTokens(false);
    long time = test(tokenizer, source);
    System.out.println(time + " milliseconds for CHAR based tokenizer.");

    // test STRING
    tokenizer = new StrTokenizer();
    tokenizer.setDelimiterString("\t");
    tokenizer.setEmptyTokenAsNull(true);
    // match the CHAR run: without this the tokenizer keeps its default handling of
    // empty tokens and the two timings would not be comparable
    tokenizer.setIgnoreEmptyTokens(false);
    time = test(tokenizer, source);
    System.out.println(time + " milliseconds for STRING based tokenizer.");
}

From source file:org.gbif.file.StrTokenizerTest.java

/**
 * Comma-delimited input with {@code "} as the quote character: a quoted field
 * keeps its embedded comma, surrounding whitespace is preserved, and empty
 * fields come back as {@code null}.
 */
@Test
public void testCsvQuoted() throws IOException {
    StrTokenizer csv = new StrTokenizer()
            .setDelimiterString(",")
            .setQuoteChar('"')
            .setEmptyTokenAsNull(true)
            .setIgnoreEmptyTokens(false);

    // quoted field containing the delimiter
    String[] fields = csv.reset("121,432423, 9099053,\"Frieda karla L.,DC.\",Ahrens").getTokenArray();
    assertEquals("121", fields[0]);
    assertEquals("432423", fields[1]);
    assertEquals(" 9099053", fields[2]);
    assertEquals("Frieda karla L.,DC.", fields[3]);
    assertEquals("Ahrens", fields[4]);

    // whitespace-only leading field is kept verbatim
    fields = csv.reset("   ,4321").getTokenArray();
    assertEquals("   ", fields[0]);
    assertEquals("4321", fields[1]);

    // empty fields in the middle become null
    fields = csv.reset(" ,,,,zzz  ").getTokenArray();
    assertEquals(" ", fields[0]);
    for (int i = 1; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);

    // leading empty fields become null as well
    fields = csv.reset(",,,,zzz  ").getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/**
 * Comma-delimited input without a quote character: every comma splits, and
 * empty fields come back as {@code null}.
 */
@Test
public void testCsvUnquoted() throws IOException {
    StrTokenizer csv = new StrTokenizer()
            .setDelimiterString(",")
            .setEmptyTokenAsNull(true)
            .setIgnoreEmptyTokens(false);

    // "L.,DC." is split in two because no quoting is configured
    String[] fields = csv.reset("121,432423, 9099053,Frieda karla L.,DC.,Ahrens").getTokenArray();
    assertEquals("121", fields[0]);
    assertEquals("432423", fields[1]);
    assertEquals(" 9099053", fields[2]);
    assertEquals("Frieda karla L.", fields[3]);
    assertEquals("DC.", fields[4]);
    assertEquals("Ahrens", fields[5]);

    // leading empty fields become null
    fields = csv.reset(",,,,zzz  ").getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/**
 * Pipe-delimited input (char delimiter) with {@code "} as the quote character:
 * a quoted field keeps its embedded pipe, whitespace is preserved, and empty
 * fields come back as {@code null}.
 */
@Test
public void testPipes() throws IOException {
    StrTokenizer piped = new StrTokenizer()
            .setDelimiterChar('|')
            .setQuoteChar('"')
            .setEmptyTokenAsNull(true)
            .setIgnoreEmptyTokens(false);

    // quoted field containing the delimiter
    String[] fields = piped.reset("121|432423| 9099053|\"Frieda karla L.|DC.\"|Ahrens").getTokenArray();
    assertEquals("121", fields[0]);
    assertEquals("432423", fields[1]);
    assertEquals(" 9099053", fields[2]);
    assertEquals("Frieda karla L.|DC.", fields[3]);
    assertEquals("Ahrens", fields[4]);

    // whitespace-only leading field is kept verbatim
    fields = piped.reset("   |4321").getTokenArray();
    assertEquals("   ", fields[0]);
    assertEquals("4321", fields[1]);

    // empty fields in the middle become null
    fields = piped.reset(" ||||zzz  ").getTokenArray();
    assertEquals(" ", fields[0]);
    for (int i = 1; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);

    // leading empty fields become null as well
    fields = piped.reset("||||zzz  ").getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/**
 * Tab-delimited input with {@code "} as the quote character: the quotes are
 * stripped from a quoted field, whitespace is preserved, and empty fields come
 * back as {@code null}.
 */
@Test
public void testTabQuoted() throws IOException {
    StrTokenizer tabbed = new StrTokenizer()
            .setDelimiterString("\t")
            .setQuoteChar('"')
            .setEmptyTokenAsNull(true)
            .setIgnoreEmptyTokens(false);

    // quoted field is returned without its surrounding quotes
    String[] fields = tabbed.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens").getTokenArray();
    assertEquals("121", fields[0]);
    assertEquals("432423", fields[1]);
    assertEquals(" 9099053", fields[2]);
    assertEquals("Frieda karla L.,DC.", fields[3]);
    assertEquals("Ahrens", fields[4]);

    // whitespace-only leading field is kept verbatim
    fields = tabbed.reset("   \t4321").getTokenArray();
    assertEquals("   ", fields[0]);
    assertEquals("4321", fields[1]);

    // empty fields in the middle become null
    fields = tabbed.reset(" \t\t\t\tzzz  ").getTokenArray();
    assertEquals(" ", fields[0]);
    for (int i = 1; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);

    // leading empty fields become null as well
    fields = tabbed.reset("\t\t\t\tzzz  ").getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);
}

From source file:org.gbif.file.StrTokenizerTest.java

/**
 * Tab-delimited input without a quote character: double quotes in the data are
 * returned as part of the field, whitespace is preserved, and empty fields come
 * back as {@code null}.
 */
@Test
public void testTabUnquoted() throws IOException {
    StrTokenizer tabbed = new StrTokenizer()
            .setDelimiterString("\t")
            .setEmptyTokenAsNull(true)
            .setIgnoreEmptyTokens(false);

    // with no quote char configured the quotes stay in the token
    String[] fields = tabbed.reset("121\t432423\t 9099053\t\"Frieda karla L.,DC.\"\tAhrens").getTokenArray();
    assertEquals("121", fields[0]);
    assertEquals("432423", fields[1]);
    assertEquals(" 9099053", fields[2]);
    assertEquals("\"Frieda karla L.,DC.\"", fields[3]);
    assertEquals("Ahrens", fields[4]);

    // whitespace-only leading field is kept verbatim
    fields = tabbed.reset("   \t4321").getTokenArray();
    assertEquals("   ", fields[0]);
    assertEquals("4321", fields[1]);

    // empty fields in the middle become null
    fields = tabbed.reset(" \t\t\t\tzzz  ").getTokenArray();
    assertEquals(" ", fields[0]);
    for (int i = 1; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);

    // leading empty fields become null as well
    fields = tabbed.reset("\t\t\t\tzzz  ").getTokenArray();
    for (int i = 0; i <= 3; i++) {
        assertNull(fields[i]);
    }
    assertEquals("zzz  ", fields[4]);
}