Usage examples for org.apache.commons.lang.StringUtils.strip
public static String strip(String str)
Strips whitespace from the start and end of a String.
From source file:ubic.gemma.datastructure.matrix.ExpressionDataWriterUtils.java
/**
 * Builds a name for a factor value in which hyphens and whitespace are replaced with underscores.
 * <p>
 * The name is taken from the first available source, in this order: the values of the characteristics
 * (joined with " | "), the measurement value, or the factor value's own value. If none is available the
 * result is the empty string.
 *
 * @param factorValue the factor value to build a name for
 * @return the trimmed name with every '-' and every whitespace character replaced by '_'
 */
public static String constructFactorValueName(FactorValue factorValue) {
    StringBuilder buf = new StringBuilder();

    if (!factorValue.getCharacteristics().isEmpty()) {
        boolean first = true;
        for (Characteristic c : factorValue.getCharacteristics()) {
            // The separator goes BETWEEN values only. Appending it after every value left a dangling
            // " |" at the end of the name (rendered as "_|") whenever there was more than one.
            if (!first) {
                buf.append(" | ");
            }
            buf.append(StringUtils.strip(c.getValue()));
            first = false;
        }
    } else if (factorValue.getMeasurement() != null) {
        buf.append(factorValue.getMeasurement().getValue());
    } else if (StringUtils.isNotBlank(factorValue.getValue())) {
        buf.append(StringUtils.strip(factorValue.getValue()));
    }

    String matchedFactorValue = buf.toString().trim();
    matchedFactorValue = matchedFactorValue.replace('-', '_');
    // Whitespace inside the name (including the blanks around '|') becomes '_'.
    return matchedFactorValue.replaceAll("\\s", "_");
}
From source file:ubic.gemma.genome.gene.service.GeneSearchServiceImpl.java
/** * Search for multiple genes at once. This attempts to limit the number of genes per query to only one. * /*from ww w .ja va2 s . c o m*/ * @param query A list of gene names (symbols), one per line. * @param taxonId * @return map with each gene-query as a key and a collection of the search-results as the value * @throws IOException */ @Override public Map<String, Collection<GeneValueObject>> searchMultipleGenesGetMap(String query, Long taxonId) throws IOException { Taxon taxon = taxonService.load(taxonId); BufferedReader reader = new BufferedReader(new StringReader(query)); String line = null; int genesAdded = 0; Map<String, Collection<GeneValueObject>> queryToGenes = new HashMap<String, Collection<GeneValueObject>>(); while ((line = reader.readLine()) != null) { line = StringUtils.strip(line); queryToGenes.put(line, new HashSet<GeneValueObject>()); } reader = new BufferedReader(new StringReader(query)); while ((line = reader.readLine()) != null) { if (StringUtils.isBlank(line)) continue; if (genesAdded >= MAX_GENES_PER_QUERY) { log.warn("Too many genes, stopping"); break; } line = StringUtils.strip(line); SearchSettings settings = SearchSettingsImpl.geneSearch(line, taxon); List<SearchResult> geneSearchResults = searchService.search(settings).get(Gene.class); // drops // predicted gene // results // FIXME inform the user (on the client!) if there are some that don't have results. 
if (geneSearchResults == null || geneSearchResults.isEmpty()) { log.warn("No gene results for gene with id: " + line); } else if (geneSearchResults.size() == 1) { // Just one result so add it Gene g = (Gene) geneSearchResults.iterator().next().getResultObject(); queryToGenes.get(line).add(new GeneValueObject(g)); genesAdded++; } else { // Many results need to find best if possible Collection<Gene> notExactMatch = new HashSet<Gene>(); Collection<GeneValueObject> sameTaxonMatch = new HashSet<GeneValueObject>(); Boolean foundMatch = false; // Usually if there is more than 1 results the search term was a official symbol and picked up matches // like grin1, grin2, grin3, grin (given the search term was grin) for (SearchResult sr : geneSearchResults) { Gene srGene = (Gene) sr.getResultObject(); if (srGene.getOfficialSymbol().equalsIgnoreCase(line)) { queryToGenes.get(line).add(new GeneValueObject(srGene)); genesAdded++; foundMatch = true; break; // found so return } else if (srGene.getTaxon().equals(taxon)) { sameTaxonMatch.add(new GeneValueObject(srGene)); } else notExactMatch.add(srGene); } // if no exact match found add all of them of the same taxon and toss a warning if (!foundMatch) { if (!sameTaxonMatch.isEmpty()) { queryToGenes.get(line).addAll(sameTaxonMatch); log.warn(sameTaxonMatch.size() + " genes found for query id = " + line + ". Genes found are: " + sameTaxonMatch + ". Adding All"); } else { log.warn(notExactMatch.size() + " genes found for query id = " + line + ". Genes found are: " + notExactMatch + ". Adding None"); } } } } return queryToGenes; }
From source file:ubic.gemma.loader.entrez.pubmed.PubMedXMLParser.java
/** * Fill in information about the book: Publisher, Editor(s), Publication year * //from w ww. j a v a 2 s . c o m * @param bibRef * @param record * @return * @throws IOException */ private void processBookRecord(BibliographicReference bibRef, Node record) throws IOException { NodeList recordNodes = record.getChildNodes(); for (int p = 0; p < recordNodes.getLength(); p++) { Node item = recordNodes.item(p); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("ArticleTitle")) { // this is the title of the chapter. bibRef.setTitle(StringUtils.strip(XMLUtils.getTextValue((Element) item))); } else if (name.equals("Book")) { processBookInfo(bibRef, item); } else if (name.equals("AuthorList")) { bibRef.setAuthorList(extractAuthorList(item.getChildNodes())); } else if (name.equals("Abstract")) { bibRef.setAbstractText(""); NodeList abstractTextSections = item.getChildNodes(); for (int q = 0; q < abstractTextSections.getLength(); q++) { Node jitem = abstractTextSections.item(q); if (!(jitem instanceof Element)) { continue; } if (jitem.getNodeName().equals("AbstractText")) { bibRef.setAbstractText( bibRef.getAbstractText() + (XMLUtils.getTextValue((Element) jitem)) + " "); } bibRef.setAbstractText(bibRef.getAbstractText().trim()); } } else if (name.equals("PMID")) { processAccession(bibRef, item); } else if (name.equals("ContributionDate")) { /* * Unusual, but happens for books that are updated with new sections. We use this instead of the * publication date. */ extractBookPublicationYear(bibRef, item); } } }
From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java
/**
 * Splits a line of the form {@code #key = value} into its key and value, both stripped of surrounding
 * whitespace. Only the first '=' separates key from value; later ones belong to the value.
 *
 * @param line the line to parse; must start with '#'
 * @return a single-entry map from key to value, or null (after logging a warning) if the line has no '='
 * @throws IllegalArgumentException if the line does not start with '#'
 */
private Map<String, String> extractKeyValue(String line) {
    if (!line.startsWith("#")) {
        throw new IllegalArgumentException("Wrong type of line");
    }

    // The guard above guarantees that the '#' sits at index 0.
    String content = line.substring(1);
    int separator = content.indexOf('=');
    if (separator < 0) {
        log.warn("Invalid key-value line, expected an '=' somewhere, got: '" + line + "'");
        return null;
    }

    String key = StringUtils.strip(content.substring(0, separator));
    String value = StringUtils.strip(content.substring(separator + 1));

    Map<String, String> pair = new HashMap<String, String>();
    pair.put(key, value);
    return pair;
}
From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java
/** * Extract a value from a line in the format xxxx=value. * /*from w ww .j a v a2s . c o m*/ * @param line * @return String following the first occurrence of '=', or null if there is no '=' in the String. */ private String extractValue(String line) { int eqIndex = line.indexOf('='); if (eqIndex < 0) { return null; // that's okay, there are lines that just indicate the end of sections. } return StringUtils.strip(line.substring(eqIndex + 1)); }
From source file:ubic.gemma.loader.expression.geo.service.GeoBrowser.java
/** * Retrieves and parses tab delimited file from GEO. File contains pageSize * GEO records starting from startPage./*from w w w . ja v a2 s .c o m*/ * * @param startPage * @param pageSize * @return list of GeoRecords * @throws IOException * @throws ParseException */ public List<GeoRecord> getRecentGeoRecords(int startPage, int pageSize) throws IOException, ParseException { if (startPage < 0 || pageSize < 0) throw new IllegalArgumentException("Values must be greater than zero "); List<GeoRecord> records = new ArrayList<GeoRecord>(); URL url = null; try { url = new URL(GEO_BROWSE_URL + startPage + GEO_BROWSE_SUFFIX + pageSize); } catch (MalformedURLException e) { throw new RuntimeException("Invalid URL " + url, e); } InputStream is = null; try { URLConnection conn = url.openConnection(); conn.connect(); is = conn.getInputStream(); } catch (IOException e) { log.error(e, e); throw e; } // We are getting a tab delimited file. BufferedReader br = new BufferedReader(new InputStreamReader(is)); // Read columns headers. String headerLine = br.readLine(); String[] headers = StringUtil.csvSplit(headerLine); // Map column names to their indices (handy later). Map<String, Integer> columnNameToIndex = new HashMap<String, Integer>(); for (int i = 0; i < headers.length; i++) { columnNameToIndex.put(headers[i], i); } // Read the rest of the file. 
String line = null; while ((line = br.readLine()) != null) { String[] fields = StringUtil.csvSplit(line); GeoRecord geoRecord = new GeoRecord(); geoRecord.setGeoAccession(fields[columnNameToIndex.get("Accession")]); geoRecord.setTitle(StringUtils .strip(fields[columnNameToIndex.get("Title")].replaceAll(FLANKING_QUOTES_REGEX, ""))); String sampleCountS = fields[columnNameToIndex.get("Sample Count")]; if (StringUtils.isNotBlank(sampleCountS)) { try { geoRecord.setNumSamples(Integer.parseInt(sampleCountS)); } catch (NumberFormatException e) { throw new RuntimeException("Could not parse sample count: " + sampleCountS); } } else { log.warn("No sample count for " + geoRecord.getGeoAccession()); } geoRecord .setContactName(fields[columnNameToIndex.get("Contact")].replaceAll(FLANKING_QUOTES_REGEX, "")); String[] taxons = fields[columnNameToIndex.get("Taxonomy")].replaceAll(FLANKING_QUOTES_REGEX, "") .split(";"); geoRecord.getOrganisms().addAll(Arrays.asList(taxons)); Date date = DateUtils.parseDate( fields[columnNameToIndex.get("Release Date")].replaceAll(FLANKING_QUOTES_REGEX, ""), DATE_FORMATS); geoRecord.setReleaseDate(date); geoRecord.setSeriesType(fields[columnNameToIndex.get("Series Type")]); records.add(geoRecord); } is.close(); if (records.isEmpty()) { log.warn("No records obtained"); } return records; }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * This method reads the file line e.g. $Run time : Category=EnvironmentalHistory Type=categorical and creates * experimental factors from it and adds them to the experimental design. * /*ww w. jav a2s . c o m*/ * @param experimentalDesign Experimental design for this expression experiment * @param experimentalFactorFileLines List of strings representing lines from input file containing experimental * factors * @param headerFields Sample header line split on tab. * @param factorValueLines Lines containing biomaterial names and their factor values */ private void addExperimentalFactorsToExperimentalDesign(ExperimentalDesign experimentalDesign, List<String> experimentalFactorFileLines, String[] headerFields, List<String> factorValueLines) { int maxWait = 0; while (!mgedOntologyService.isOntologyLoaded()) { try { Thread.sleep(1000); if (maxWait++ > 100) { throw new RuntimeException("MGED is not loaded and gave up waiting"); } } catch (InterruptedException e) { e.printStackTrace(); } } log.info("Addding experimental factors to experimental design: " + experimentalDesign.getId()); Collection<OntologyTerm> terms = mgedOntologyService.getMgedTermsByKey("factor"); if (experimentalDesign.getExperimentalFactors() == null) { experimentalDesign.setExperimentalFactors(new HashSet<ExperimentalFactor>()); } Map<String, Set<String>> mapFactorSampleValues = getMapFactorSampleValues(headerFields, factorValueLines); for (String experimentalFactorFileLine : experimentalFactorFileLines) { // $Run time : Category=EnvironmentalHistory Type=categorical String[] experimentalFactorfields = experimentalFactorFileLine.split(":"); String factorValue = (StringUtils.strip(experimentalFactorfields[0] .replaceFirst(Pattern.quote(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", ""))) .trim(); String categoryAndType = StringUtils.strip(experimentalFactorfields[1]); String[] categoryAndTypeFields = StringUtils.split(categoryAndType); // e.g. 
Category=EnvironmentalHistory String category = categoryAndTypeFields[0]; // e.g. EnvironmentalHistory String categoryValue = StringUtils.split(category, "=")[1]; ExperimentalFactor experimentalFactorFromFile = ExperimentalFactor.Factory.newInstance(); experimentalFactorFromFile.setExperimentalDesign(experimentalDesign); VocabCharacteristic vc = mgedLookup(categoryValue, terms); // e.g. Category=EnvironmentalHistory String categoryTypeValue = categoryAndTypeFields[1]; String factorType = StringUtils.split(categoryTypeValue, "=")[1]; // vc.setCategory( categoryType ); experimentalFactorFromFile.setCategory(vc); experimentalFactorFromFile.setName(factorValue); experimentalFactorFromFile.setDescription(factorValue); experimentalFactorFromFile.setType( factorType.equalsIgnoreCase("CATEGORICAL") ? FactorType.CATEGORICAL : FactorType.CONTINUOUS); addFactorValuesToExperimentalFactor(experimentalFactorFromFile, mapFactorSampleValues, factorType); if (!checkForDuplicateExperimentalFactorOnExperimentalDesign(experimentalDesign, experimentalFactorFromFile)) { // assert experimentalFactorFromFile.getId() != null; experimentalDesign.getExperimentalFactors().add(experimentalFactorFromFile); // here is was the update log.debug("Added experimental factor value " + experimentalFactorFromFile + " to experimental design " + experimentalDesign); } } }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Add the factor values to the biomaterial * //from www .j a v a 2 s .c o m * @param experiment * @param experimentBioMaterials Current expression experiment's biomaterials. * @param experimentalDesign experimental design * @param factorValueLines Lines from file containing factor values and biomaterial ids * @param headerFields * @return Collection of biomaterials associated with this experiment, this is returned as the biomaterial is in a * bioassay (first one retrieved) */ private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment( ExpressionExperiment experiment, Collection<BioMaterial> experimentBioMaterials, ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) { log.debug("Adding factors values to biomaterials: " + experimentalDesign.getId()); Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign .getExperimentalFactors(); Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<BioMaterial>(); Collection<BioMaterial> seenBioMaterials = new HashSet<BioMaterial>(); Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<ExperimentalFactor, Collection<BioMaterial>>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); String externalId = null; boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID"); if (hasExternalId) { externalId = factorValueFields[1]; } BioMaterial currentBioMaterial = getBioMaterialFromExpressionExperiment(experiment, experimentBioMaterials, factorValueFields[0], externalId); if (currentBioMaterial == null) { throw new IllegalStateException("No biomaterial for " + factorValueFields[0]); } if (seenBioMaterials.contains(currentBioMaterial)) { throw new IllegalArgumentException( "A biomaterial occurred more than once in the file: " + currentBioMaterial); } 
seenBioMaterials.add(currentBioMaterial); int start = 1; if (hasExternalId) { start = 2; } for (int i = start; i < factorValueFields.length; i++) { ExperimentalFactor currentExperimentalFactor = null; String currentExperimentalFactorName = StringUtils.strip(headerFields[i]); FactorValue currentFactorValue = null; String currentFactorValueValue = StringUtils.strip(factorValueFields[i]); if (StringUtils.isBlank(currentFactorValueValue)) { // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings. continue; } for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) { if (experimentalFactor.getName().equals(currentExperimentalFactorName)) { currentExperimentalFactor = experimentalFactor; } } if (currentExperimentalFactor == null) throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName); Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor .getFactorValues(); for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) { if (factorValue.getValue().trim().equalsIgnoreCase(currentFactorValueValue.trim())) { currentFactorValue = factorValue; } } if (currentFactorValue == null) { log.error("Current factor value not found " + currentExperimentalFactor + currentFactorValueValue); } else { if (!checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) { currentBioMaterial.getFactorValues().add(currentFactorValue); } else { // already got warned. } } log.debug("Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial); biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial); if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) { factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>()); } factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial); } } /* * Check if every biomaterial got used. 
Worth a warning, at least. */ for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) { if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) { log.warn( "File did not contain values for all factor - biomaterial combinations: Missing at least one for " + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/" + experimentBioMaterials.size() + " ]"); } } return biomaterialsWithFactorValuesInExperiment; }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Get a map of experimental values keyed on experimental factor name * /*from ww w. j av a 2 s . co m*/ * @param headerFields * @param factorValueLines * @return map of experimental factor values keyed on experimental factor */ private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields, List<String> factorValueLines) { Map<String, Set<String>> factorSampleValues = new HashMap<String, Set<String>>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); for (int i = 1; i < headerFields.length; i++) { // get the key String value = headerFields[i]; value = StringUtils.strip(value); String factorValue = StringUtils.strip(factorValueFields[i]); Set<String> listFactorValues = factorSampleValues.get(value); if (listFactorValues == null) { listFactorValues = new HashSet<String>(); } listFactorValues.add(factorValue); factorSampleValues.put(value, listFactorValues); } } return factorSampleValues; }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that the input for experimental factors is correct: Experimental factor file line should be for e.g. * #$Run time : Category=EnvironmentalHistory Type=categorical Checks there is a colon, between experimental factor * and category and that category is correctly formatted. * //from w w w .j a v a 2 s . c o m * @param sampleHeaderLine Lines in file corresponding to order of experimental factors * @param experimentalFactorList The lines in the file corresponding to experimental factors. * @throws IOException Experimental factor lines were not correctly format. */ private void validateExperimentalFactorFileContent(List<String> experimentalFactorLines, String sampleHeaderLine) throws IOException { Set<String> experimentalFactorValueNames = new HashSet<String>(); // validate experimental factor lines for (String line : experimentalFactorLines) { String[] fields = line.split(":"); if (fields.length != 2) { throw new IOException( "EF description must have two fields with a single ':' in between (" + line + ")"); } String factorName = StringUtils.strip(fields[0] .replaceFirst(Pattern.quote(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", "")); experimentalFactorValueNames.add(factorName); String category = StringUtils.strip(fields[1]); String[] descriptions = StringUtils.split(category); if (descriptions.length != 2) { throw new IOException("EF details should have the format 'Category=CATEGORY Type=TYPE'"); } } validateSampleHeaderFileContent(experimentalFactorValueNames, experimentalFactorLines.size(), sampleHeaderLine); }