Example usage for org.apache.commons.lang3 StringUtils splitPreserveAllTokens

List of usage examples for org.apache.commons.lang3 StringUtils splitPreserveAllTokens

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils splitPreserveAllTokens.

Prototype

public static String[] splitPreserveAllTokens(final String str) 

Source Link

Document

Splits the provided text into an array, using whitespace as the separator, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file: ubic.gemma.core.loader.genome.ProbeSequenceParser.java

/**
 * Parses one line of tabular probe data into a {@link BioSequence}.
 * <p>
 * The line is split on whitespace (empty tokens preserved) and must yield exactly three fields:
 * probe name, sequence name, and sequence. Characters in the sequence outside {@code a-y}/{@code A-Y}
 * are removed before the sequence is stored. The resulting object is registered under the probe id
 * via {@code this.put}; a warning is logged when {@code this.results} already holds that probe id.
 *
 * @param line the raw input line
 * @return the populated sequence, or {@code null} when the line is blank, the probe id is blank, or
 *         the sequence is blank after cleanup
 * @throws RuntimeException         if the line starts with {@code >} (FASTA header)
 * @throws IllegalArgumentException if the line does not split into exactly three fields
 */
@Override
public BioSequence parseOneLine(String line) {

    // FASTA headers are rejected up front; only the tabular layout is handled here.
    if (line.startsWith(">")) {
        throw new RuntimeException(
                "FASTA format not supported - please use the tabular format for oligonucleotides");
    }

    if (StringUtils.isBlank(line)) {
        return null;
    }

    final String[] fields = StringUtils.splitPreserveAllTokens(line);

    switch (fields.length) {
    case 0:
        // Defensive: nothing to parse.
        return null;
    case 3:
        break;
    default:
        throw new IllegalArgumentException(
                "Expected 3 fields: probe name, sequence name, sequence; line=" + line);
    }

    final String probeId = fields[0].trim();
    if (StringUtils.isBlank(probeId)) {
        return null;
    }

    // Rarely there are extra junk characters (see bug 2719), so anything outside a-y / A-Y is dropped.
    // The letters that are kept cover the nucleotide codes:
    //   A Adenine
    //   C Cytosine
    //   G Guanine
    //   T Thymine
    //   U Uracil
    //   R Purine (A or G)
    //   Y Pyrimidine (C, T, or U)
    //   M C or A
    //   K T, U, or G
    //   W T, U, or A
    //   S C or G
    //   B C, T, U, or G (not A)
    //   D A, T, U, or G (not C)
    //   H A, T, U, or C (not G)
    //   V A, C, or G (not T, not U)
    //   N Any base (A, C, G, T, or U)
    final String bases = fields[2].trim().replaceAll("[^a-yA-Y]", "");
    if (StringUtils.isBlank(bases)) {
        return null;
    }

    final String sequenceName = fields[1].trim();

    final BioSequence parsed = BioSequence.Factory.newInstance();
    parsed.setName(sequenceName);
    parsed.setSequence(bases);
    parsed.setLength((long) bases.length());
    parsed.setIsApproximateLength(false);
    parsed.setIsCircular(false);

    // A repeated probe id is only reported; the new sequence is still stored via put().
    final boolean alreadySeen = this.results.containsKey(probeId);
    if (alreadySeen) {
        log.warn("Duplicated probe id: " + probeId);
    }
    this.put(probeId, parsed);

    return parsed;
}