Example usage for org.apache.commons.lang StringUtils splitPreserveAllTokens

List of usage examples for org.apache.commons.lang StringUtils splitPreserveAllTokens

Introduction

On this page you can find example usages of org.apache.commons.lang StringUtils splitPreserveAllTokens.

Prototype

public static String[] splitPreserveAllTokens(String str, String separatorChars) 

Source Link

Document

Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Reads an experimental design file from the given stream, validates it and attaches the factors and factor values
 * it describes to the design and biomaterials of the experiment.
 * <p>
 * Lines are sorted into three groups: lines starting with EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR, the first
 * remaining line that is neither blank nor a '#' comment (the sample header), and all later such lines (factor
 * values). The header is split on tabs with empty tokens preserved.
 *
 * @param experiment the experiment whose design and biomaterials are updated
 * @param is stream to read the design file from; it is not closed here
 * @param dryRun not consulted anywhere in this method body
 * @throws IOException if reading the stream fails (NOTE(review): the validation helpers may also raise it; their
 *         bodies are not visible here)
 * @throws IllegalStateException if no MGED ontology service is available
 */
@Override
public void importDesign(ExpressionExperiment experiment, InputStream is, boolean dryRun) throws IOException {
    this.mgedOntologyService = this.ontologyService.getMgedOntologyService();

    log.debug("Parsing input file");

    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    if (mgedOntologyService == null) {
        throw new IllegalStateException("Please set the MGED OntologyService, thanks.");
    }

    ExperimentalDesign design = experiment.getExperimentalDesign();

    if (!design.getExperimentalFactors().isEmpty()) {
        log.warn("Experimental design already has factors, import will add new ones");
    }

    design.setDescription("Parsed from file.");

    List<String> factorDescriptionLines = new ArrayList<String>();
    List<String> valueLines = new ArrayList<String>();
    String headerLine = "";
    boolean headerSeen = false;

    // Classify every line of the file; the first data line is the header, the rest are factor value rows.
    for (String current = reader.readLine(); current != null; current = reader.readLine()) {
        if (current.startsWith(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) {
            factorDescriptionLines.add(current);
            continue;
        }
        if (current.startsWith("#") || StringUtils.isBlank(current)) {
            continue;
        }
        if (headerSeen) {
            valueLines.add(current);
        } else {
            headerLine = current;
            headerSeen = true;
        }
    }

    String[] headerFields = StringUtils.splitPreserveAllTokens(headerLine, "\t");

    Collection<BioMaterial> experimentBioMaterials = this.bioMaterialService.findByExperiment(experiment);

    // All validation runs before anything is attached to the design.
    validateFileComponents(factorDescriptionLines, headerLine, valueLines);
    validateExperimentalFactorFileContent(factorDescriptionLines, headerLine);
    validateFactorFileContent(factorDescriptionLines.size(), valueLines);
    validateBioMaterialFileContent(experiment, experimentBioMaterials, valueLines);

    // build up the composite: create experimental factor then add the experimental value
    addExperimentalFactorsToExperimentalDesign(design, factorDescriptionLines, headerFields, valueLines);

    experimentalDesignService.update(design);

    // a bit tricky as there is an assumption that the first biomaterial in the bioassay set is the relevant one;
    // safer to use biomaterial collection returned; cannot guarantee order of objects in collection.
    Collection<BioMaterial> updatedBioMaterials = addFactorValuesToBioMaterialsInExpressionExperiment(experiment,
            experimentBioMaterials, design, valueLines, headerFields);

    for (BioMaterial updated : updatedBioMaterials) {
        this.bioMaterialService.update(updated);

        // just a debugging sanity check: reload and report what was stored.
        BioMaterial reloaded = this.bioMaterialService.load(updated.getId());
        if (log.isDebugEnabled()) {
            log.debug(reloaded + ": " + reloaded.getFactorValues().size() + " factor values: "
                    + StringUtils.join(reloaded.getFactorValues(), " ; "));
        }
    }

}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Add the factor values to the biomaterial
 * /*from w  ww  . ja v  a2 s. c  o m*/
 * @param experiment
 * @param experimentBioMaterials Current expression experiment's biomaterials.
 * @param experimentalDesign experimental design
 * @param factorValueLines Lines from file containing factor values and biomaterial ids
 * @param headerFields
 * @return Collection of biomaterials associated with this experiment, this is returned as the biomaterial is in a
 *         bioassay (first one retrieved)
 */
private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment(
        ExpressionExperiment experiment, Collection<BioMaterial> experimentBioMaterials,
        ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) {
    log.debug("Adding factors values to biomaterials: " + experimentalDesign.getId());
    Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign
            .getExperimentalFactors();
    Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<BioMaterial>();

    Collection<BioMaterial> seenBioMaterials = new HashSet<BioMaterial>();

    Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<ExperimentalFactor, Collection<BioMaterial>>();

    for (String factorValueLine : factorValueLines) {
        String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");

        String externalId = null;
        boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID");
        if (hasExternalId) {
            externalId = factorValueFields[1];
        }
        BioMaterial currentBioMaterial = getBioMaterialFromExpressionExperiment(experiment,
                experimentBioMaterials, factorValueFields[0], externalId);

        if (currentBioMaterial == null) {
            throw new IllegalStateException("No biomaterial for " + factorValueFields[0]);
        }

        if (seenBioMaterials.contains(currentBioMaterial)) {
            throw new IllegalArgumentException(
                    "A biomaterial occurred more than once in the file: " + currentBioMaterial);
        }

        seenBioMaterials.add(currentBioMaterial);

        int start = 1;
        if (hasExternalId) {
            start = 2;
        }

        for (int i = start; i < factorValueFields.length; i++) {
            ExperimentalFactor currentExperimentalFactor = null;
            String currentExperimentalFactorName = StringUtils.strip(headerFields[i]);

            FactorValue currentFactorValue = null;
            String currentFactorValueValue = StringUtils.strip(factorValueFields[i]);

            if (StringUtils.isBlank(currentFactorValueValue)) {
                // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings.
                continue;
            }

            for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) {
                if (experimentalFactor.getName().equals(currentExperimentalFactorName)) {
                    currentExperimentalFactor = experimentalFactor;
                }
            }

            if (currentExperimentalFactor == null)
                throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName);

            Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor
                    .getFactorValues();

            for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) {
                if (factorValue.getValue().trim().equalsIgnoreCase(currentFactorValueValue.trim())) {
                    currentFactorValue = factorValue;
                }
            }

            if (currentFactorValue == null) {
                log.error("Current factor value not found " + currentExperimentalFactor
                        + currentFactorValueValue);
            } else {
                if (!checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) {
                    currentBioMaterial.getFactorValues().add(currentFactorValue);
                } else {
                    // already got warned.
                }
            }
            log.debug("Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial);
            biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial);

            if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) {
                factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>());
            }
            factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial);

        }

    }

    /*
     * Check if every biomaterial got used. Worth a warning, at least.
     */
    for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) {
        if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) {
            log.warn(
                    "File did not contain values for all factor - biomaterial combinations: Missing at least one for "
                            + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/"
                            + experimentBioMaterials.size() + " ]");
        }
    }

    return biomaterialsWithFactorValuesInExperiment;
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Collects, for each header column, the distinct values found in that column across all factor value lines.
 * <p>
 * Column 0 is skipped; every header column from index 1 onwards is used as a key, with no column special-cased
 * here. Header names and cell values are stripped of surrounding whitespace before use.
 *
 * @param headerFields the tab-separated fields of the sample header line
 * @param factorValueLines lines of the file holding the factor values, one biomaterial per line
 * @return map from stripped header name to the set of stripped values seen in that column
 */
private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields,
        List<String> factorValueLines) {
    Map<String, Set<String>> valuesByFactorName = new HashMap<String, Set<String>>();

    for (String row : factorValueLines) {
        String[] cells = StringUtils.splitPreserveAllTokens(row, "\t");

        for (int column = 1; column < headerFields.length; column++) {
            String factorName = StringUtils.strip(headerFields[column]);
            String cellValue = StringUtils.strip(cells[column]);

            // Create the bucket for this factor the first time its column is encountered.
            Set<String> seenValues = valuesByFactorName.get(factorName);
            if (seenValues == null) {
                seenValues = new HashSet<String>();
                valuesByFactorName.put(factorName, seenValues);
            }
            seenValues.add(cellValue);
        }
    }

    return valuesByFactorName;
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an
 * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient
 * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should
 * match what is stored/*  www.ja va  2  s  .c  o m*/
 * 
 * @param experiment Current experiment
 * @param factorValueLines Lines containing biomaterial names and their factor values
 */
private void validateBioMaterialFileContent(ExpressionExperiment experiment,
        Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException {

    for (String factorValueLine : factorValueLines) {
        String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t');
        if (vals.length < 2) {
            throw new IllegalArgumentException(
                    "Expected a file with at least two columns separated by tabs, got " + factorValueLine);
        }
        BioMaterial bioMaterialInFile = getBioMaterialFromExpressionExperiment(experiment, bioMaterials,
                vals[0], vals[1]);
        if (bioMaterialInFile == null) {
            throw new IllegalArgumentException(
                    "The uploaded file has a biomaterial name that does not match the study: "
                            + StringUtils.splitPreserveAllTokens(factorValueLine, "\t")[0]
                            + " (formatted based on on input: ");
        }
    }
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Validates that factor values given in file for each biomaterial match the number of experimental factor values
 * expected./*  w  w  w  . j av  a 2  s.  co  m*/
 * 
 * @para numberOfExperimentalFactors
 * @param factorValueList Represents lines of file containing factor values for a biomaterial
 */
private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList)
        throws IOException {
    for (String factorValueLine : factorValueList) {
        String[] fields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");
        if (fields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
            throw new IOException(
                    "Expected no more than " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                            + " columns based on EF descriptions (plus id column), got " + fields.length);
        }
        if (fields.length <= numberOfExperimentalFactors) {
            throw new IOException("Expected at least " + (numberOfExperimentalFactors + 1)
                    + " columns based on EF descriptions (plus id column), got " + fields.length);

        }
    }
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Validates that the sample header is correctly formatted. Checks that the experimental factors defined in the
 * header match those in the experimental factor file lines.
 * /*  ww  w . j  a  va 2  s.co m*/
 * @param experimentalFactorValueNames
 * @param numberOfExperimentalFactors
 * @param sampleHeaderLine
 * @throws IOException Validation fails.
 */
private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames,
        Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException {
    String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");

    // we might have the ids, and the external id.
    if (headerFields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
        throw new IOException("Expected " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                + " columns based on EF descriptions (plus id column), got " + headerFields.length);
    }

    for (int i = 1; i < headerFields.length; i++) {

        String value = headerFields[i];

        value = StringUtils.strip(value);

        if (value.equals("ExternalID")) {
            // that's fine.
            continue;
        }

        if (!experimentalFactorValueNames.contains(value)) {
            throw new IOException("Expected to find an EF matching the column heading '" + value + "'");
        }

    }

}

From source file:ubic.gemma.loader.genome.FastaParser.java

/**
 * Fills in the name (and, for some formats, the description and database entry) of a sequence from a
 * defline-style FASTA header. The header is split on '|' and ';' with empty tokens preserved, and the first token
 * (minus a leading '&gt;') selects the format.
 * <p>
 * The following formats are supported
 * <ul>
 * <li>GenBank: gi|gi-number|gb|accession|locus
 * <li>EMBL Data Library : gi|gi-number|emb|accession|locus
 * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus
 * <li>NBRF PIR : pir||entry
 * <li>Protein Research Foundation : prf||name
 * <li>SWISS-PROT : sp|accession|name
 * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain
 * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE
 * <li>Patents : pat|country|number
 * <li>GenInfo Backbone Id bbs|number
 * <li>General database identifier : gnl|database|identifier
 * <li>NCBI Reference Sequence : ref|accession|locus
 * <li>Local Sequence identifier : lcl|identifier
 * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: &gt;H4002F12-5 )
 * <li>Generic: probeid
 * </ul>
 * Tags without a dedicated branch below fall through to the generic handling.
 *
 * @param bioSequence sequence to populate
 * @param header the FASTA header line
 * @return the result of the NIA parser for NIA-style headers, otherwise true
 * @throws IllegalArgumentException if the header is null or empty, or has fewer fields than its format requires
 */
private boolean parseDeflineHeader(BioSequence bioSequence, String header) {
    // one of the genbank formats.
    String[] split = StringUtils.splitPreserveAllTokens(header, "|;");

    // splitPreserveAllTokens returns null for null input and an empty array for an empty string.
    if (split == null || split.length == 0) {
        throw new IllegalArgumentException("FASTA header was null or empty");
    }

    String firstTag = split[0];

    // assert firstTag.startsWith( ">" );
    // assert firstTag.length() > 1;
    firstTag = StringUtils.removeStart(firstTag, ">");

    if (firstTag.equals("gi")) {
        requireDeflineFields(split, 5, header);
        bioSequence.setDescription(split[4]);
        String genbankAcc = split[3]; // with version number, possibly
        DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc);
        bioSequence.setName(genbank.getAccession()); // without version number.
        bioSequence.setSequenceDatabaseEntry(genbank);
    } else if (firstTag.equals("pir")) {
        // NOTE(review): with empty tokens preserved, a "pir||entry" header has the entry at index 2 and an
        // empty string at index 1 - confirm which index is intended.
        requireDeflineFields(split, 2, header);
        bioSequence.setName(split[1]);
    } else if (firstTag.equals("sp")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("ref")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("lcl")) {
        requireDeflineFields(split, 2, header);
        bioSequence.setName(split[1]);
    } else if (firstTag.equals("pdb")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("gnl")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[2]);
    } else if (firstTag.equals("entry:chain")) {
        requireDeflineFields(split, 2, header);
        bioSequence.setName(split[1]);
    } else if (firstTag.matches(NIA_HEADER_REGEX)) {
        return parseNIA(bioSequence, header);
    } else {
        // generic. The name is the raw first token, i.e. without the '>' removal applied to firstTag.
        bioSequence.setName(split[0]);
        if (split.length > 1) {
            bioSequence.setDescription(split[1]);
        }
        // log.warn( "Defline-style FASTA header in unrecognized format, started with " + firstTag );
        // return false;
    }
    return true;
}

/**
 * Rejects a header that has fewer fields than the format selected by its first tag needs.
 *
 * @param split the header fields
 * @param required minimum number of fields the format reads
 * @param header the original header, included in the error message
 * @throws IllegalArgumentException if there are fewer than {@code required} fields
 */
private void requireDeflineFields(String[] split, int required, String header) {
    if (split.length < required) {
        throw new IllegalArgumentException("Malformed FASTA header, expected at least " + required
                + " fields separated by '|' or ';' but got " + split.length + ": " + header);
    }
}

From source file:ubic.gemma.loader.genome.gene.ExternalFileGeneLoaderServiceImpl.java

/**
 * Splits one line of a gene file into its tab-separated columns, preserving empty columns.
 *
 * @param line A line from the gene file
 * @return the columns of the line, or null when the line is blank or starts with '#'
 * @throws IOException if the line has fewer than two columns
 */
private String[] readLine(String line) throws IOException {
    // Blank lines and '#' comment lines carry no gene data.
    if (StringUtils.isBlank(line) || line.startsWith("#")) {
        return null;
    }

    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length >= 2) {
        return columns;
    }
    throw new IOException("Illegal format, expected at least 2 columns, got " + columns.length);
}

From source file:ubic.gemma.loader.genome.gene.ncbi.homology.HomologeneServiceImpl.java

/**
 * @param is/*w w  w .jav a  2s  .c  om*/
 * @throws IOException
 */
protected void parseHomologGeneFile(InputStream is) throws IOException {

    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String line = null;

    while ((line = br.readLine()) != null) {

        if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
            continue;
        }
        String[] fields = StringUtils.splitPreserveAllTokens(line, DELIMITING_CHARACTER);

        Integer taxonId = Integer.parseInt(fields[1]);
        Long groupId;
        Long geneId;
        try {
            groupId = Long.parseLong(fields[0]);
            geneId = Long.parseLong(fields[2]);
        } catch (NumberFormatException e) {
            log.warn("Unparseable line from homologene: " + line);
            continue;
        }
        String geneSymbol = fields[3];

        if (!group2Gene.containsKey(groupId)) {
            group2Gene.put(groupId, new ArrayList<Long>());
        }
        group2Gene.get(groupId).add(geneId);

        if (!gene2Group.containsKey(geneId)) {
            gene2Group.put(geneId, groupId);
        } else {
            log.warn("Duplicate gene ID encountered.  Skipping: geneID=" + geneId + " ,taxonID = " + taxonId
                    + " ,geneSymbol = " + geneSymbol);
        }
    }
    ready.set(true);
    log.info("Gene Homology successfully loaded: " + gene2Group.keySet().size() + " genes covered in "
            + group2Gene.keySet().size() + " groups");

}

From source file:ubic.gemma.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java

/**
 * Parses one tab-separated line of a gene2accession file and adds the accession to the gene data being
 * accumulated. When the gene id differs from that of the previous accepted line, the accumulated gene data is
 * put on the queue first and a fresh accumulator is started.
 *
 * @param line one line of the file
 * @return the parsed accession, or null if the fields produced no accession or the gene is not among the
 *         genes of interest
 * @throws IllegalArgumentException if the line does not have NCBI_GENE2ACCESSION_FIELDS_PER_ROW fields
 * @throws RuntimeException if the thread is interrupted while waiting to put gene data on the queue
 */
@Override
public NCBIGene2Accession parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    if (fields.length != NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
        throw new IllegalArgumentException("Line is not in the right format: has " + fields.length
                + " fields, expected " + NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
    }

    NCBIGene2Accession currentAccession = processFields(fields);

    if (currentAccession == null) {
        return null;
    }

    addResult(currentAccession); // really doesn't serve much of a purpose

    /*
     * Only some genes are relevant - for example, we might have filtered them by taxon.
     */
    if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) {
        return null;
    }

    // if the current gene Id is different from this current one, then
    // we are done with the gene Id. Push the geneCollection into the queue.
    if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) {
        // push the gene set to the queue
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still see the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // clear the gene set
        geneData = new NcbiGeneData();
        if (geneInfo != null) {
            geneInfo.remove(lastGeneId);
        }
    }

    assert currentAccession.getGeneId() != null;

    // we're either starting a new one, or continuing with an old one.
    lastGeneId = currentAccession.getGeneId();
    geneData.addAccession(currentAccession);
    // geneInfo is treated as optional everywhere above, so it must not be dereferenced unguarded here.
    if (geneInfo != null) {
        geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId()));
    }

    // this will be a trailing accession.?
    return currentAccession;
}