List of usage examples for org.apache.commons.csv CSVRecord isConsistent
public boolean isConsistent()
From source file:ai.grakn.migration.csv.CSVMigrator.java
/**
 * Turns a single CSV record into the {@code Map<String, Object>} form used as input for graql
 * templating.
 *
 * <p>Only the entries whose value is accepted by {@code validValue} end up in the result.
 *
 * @param data a record of the input file (any row but the first)
 * @return the accepted entries of the record, keyed as in {@code data.toMap()}
 * @throws RuntimeException if the record reports itself as not consistent
 */
private Map<String, Object> parse(CSVRecord data) {
    if (data.isConsistent()) {
        return data.toMap().entrySet().stream()
                .filter(entry -> validValue(entry.getValue()))
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    throw new RuntimeException("Invalid CSV " + data.toMap());
}
From source file:com.datascience.hadoop.CsvRecordReader.java
@Override public boolean next(LongWritable key, ListWritable<Text> value) throws IOException { value.clear();//from w w w .j a va 2s . c o m try { if (iterator.hasNext()) { CSVRecord record = iterator.next(); position++; colLength = colLength == null ? record.size() : colLength; if ((!record.isConsistent() || record.size() != colLength) && strict) { String message = String.format("%s: %s", "inconsistent record at position", position); throw new CsvParseException(message); } key.set(record.getRecordNumber()); for (int i = 0; i < record.size(); i++) { String item = record.get(i); if (item == null) { value.add(null); } else { Text text = cache[i]; if (text == null) { text = new Text(); cache[i] = text; } text.set(item); value.add(text); } } //position = record.getCharacterPosition(); return true; } } catch (Exception e) { LOGGER.warn("failed to parse record at position: " + position); if (strict) { throw e; } else { return next(key, value); } } return false; }
From source file:com.xceptance.xlt.common.tests.AbstractURLTestCase.java
/** * Loading of the data. There is a state variable used to indicate that we already did that. * //from ww w .j av a 2s .c o m * @throws IOException */ @Before public void loadData() throws IOException { login = getProperty("login", getProperty("com.xceptance.xlt.auth.userName")); password = getProperty("password", getProperty("com.xceptance.xlt.auth.password")); // load the data. Ideally we would offload the file searching to // XltProperties.getDataFile(String name) // or XltProperties.getDataFile(String name, String locale) // or XltProperties.getDataFile(String name, Locale locale) final String dataDirectory = XltProperties.getInstance().getProperty( XltConstants.XLT_PACKAGE_PATH + ".data.directory", "config" + File.separatorChar + "data"); final File file = new File(dataDirectory, getProperty("filename", Session.getCurrent().getUserName() + ".csv")); BufferedReader br = null; boolean incorrectLines = false; try { br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); // permit # as comment, empty lines, set comma as separator, and activate the header final CSVFormat csvFormat = CSVFormat.RFC4180.toBuilder().withIgnoreEmptyLines(true) .withCommentStart('#').withHeader().withIgnoreSurroundingSpaces(true).build(); final CSVParser parser = new CSVParser(br, csvFormat); final Iterator<CSVRecord> csvRecords = parser.iterator(); // verify header fields to avoid problems with incorrect spelling or spaces final Map<String, Integer> headerMap = parser.getHeaderMap(); for (final String headerField : headerMap.keySet()) { if (!CSVBasedURLAction.isPermittedHeaderField(headerField)) { Assert.fail(MessageFormat.format("Unsupported or misspelled header field: {0}", headerField)); } } // go over all lines, this is a little odd, because we have to catch the iterator exception while (true) { try { final boolean hasNext = csvRecords.hasNext(); if (!hasNext) { break; } } catch (final Exception e) { // the plus 1 is meant to correct the increment 
missing because of the exception throw new RuntimeException( MessageFormat.format("Line at {0} is invalid, because of <{1}>. Line is ignored.", parser.getLineNumber() + 1, e.getMessage())); } final CSVRecord csvRecord = csvRecords.next(); // only take ok lines if (csvRecord.isConsistent()) { // guard against data exceptions try { // do we have an url? if (csvRecord.get(CSVBasedURLAction.URL) != null) { // take it csvBasedActions.add(new CSVBasedURLAction(csvRecord, interpreter)); } else { XltLogger.runTimeLogger.error(MessageFormat.format( "Line at {0} does not contain any URL. Line is ignored: {1}", parser.getLineNumber(), csvRecord)); } } catch (final Exception e) { throw new RuntimeException(MessageFormat.format( "Line at {0} is invalid, because of <{2}>. Line is ignored: {1}", parser.getLineNumber(), csvRecord, e.getMessage())); } } else { XltLogger.runTimeLogger.error(MessageFormat.format( "Line at {0} has not been correctly formatted. Line is ignored: {1}", parser.getLineNumber(), csvRecord)); incorrectLines = true; } } } finally { IOUtils.closeQuietly(br); } // stop if we have anything the is incorrect, avoid half running test cases if (incorrectLines) { throw new RuntimeException("Found incorrectly formatted lines. Stopping here."); } }
From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv.CSVFileReader.java
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException { List<DataVariable> variableList = new ArrayList<>(); CSVParser parser = new CSVParser(csvReader, inFormat.withHeader()); Map<String, Integer> headers = parser.getHeaderMap(); int i = 0;//w w w . j a va 2 s . c o m for (String varName : headers.keySet()) { if (varName == null || varName.isEmpty()) { // TODO: // Add a sensible variable name validation algorithm. // -- L.A. 4.0 alpha 1 throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader")); } DataVariable dv = new DataVariable(); dv.setName(varName); dv.setLabel(varName); dv.setInvalidRanges(new ArrayList<>()); dv.setSummaryStatistics(new ArrayList<>()); dv.setUnf("UNF:6:NOTCALCULATED"); dv.setCategories(new ArrayList<>()); variableList.add(dv); dv.setTypeCharacter(); dv.setIntervalDiscrete(); dv.setFileOrder(i); dv.setDataTable(dataTable); i++; } dataTable.setVarQuantity((long) variableList.size()); dataTable.setDataVariables(variableList); boolean[] isNumericVariable = new boolean[headers.size()]; boolean[] isIntegerVariable = new boolean[headers.size()]; boolean[] isTimeVariable = new boolean[headers.size()]; boolean[] isDateVariable = new boolean[headers.size()]; for (i = 0; i < headers.size(); i++) { // OK, let's assume that every variable is numeric; // but we'll go through the file and examine every value; the // moment we find a value that's not a legit numeric one, we'll // assume that it is in fact a String. isNumericVariable[i] = true; isIntegerVariable[i] = true; isDateVariable[i] = true; isTimeVariable[i] = true; } // First, "learning" pass. 
// (we'll save the incoming stream in another temp file:) SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()]; SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()]; File firstPassTempFile = File.createTempFile("firstpass-", ".csv"); try (CSVPrinter csvFilePrinter = new CSVPrinter( // TODO allow other parsers of tabular data to use this parser by changin inFormat new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) { //Write headers csvFilePrinter.printRecord(headers.keySet()); for (CSVRecord record : parser.getRecords()) { // Checks if #records = #columns in header if (!record.isConsistent()) { List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() }); throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args)); } for (i = 0; i < headers.size(); i++) { String varString = record.get(i); isIntegerVariable[i] = isIntegerVariable[i] && varString != null && (varString.isEmpty() || varString.equals("null") || (firstNumCharSet.contains(varString.charAt(0)) && StringUtils.isNumeric(varString.substring(1)))); if (isNumericVariable[i]) { // If variable might be "numeric" test to see if this value is a parsable number: if (varString != null && !varString.isEmpty()) { boolean isNumeric = false; boolean isInteger = false; if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA") || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf") || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) { continue; } else { try { Double testDoubleValue = new Double(varString); continue; } catch (NumberFormatException ex) { // the token failed to parse as a double // so the column is a string variable. 
} } isNumericVariable[i] = false; } } // If this is not a numeric column, see if it is a date collumn // by parsing the cell as a date or date-time value: if (!isNumericVariable[i]) { Date dateResult = null; if (isTimeVariable[i]) { if (varString != null && !varString.isEmpty()) { boolean isTime = false; if (selectedDateTimeFormat[i] != null) { ParsePosition pos = new ParsePosition(0); dateResult = selectedDateTimeFormat[i].parse(varString, pos); if (dateResult != null && pos.getIndex() == varString.length()) { // OK, successfully parsed a value! isTime = true; } } else { for (SimpleDateFormat format : TIME_FORMATS) { ParsePosition pos = new ParsePosition(0); dateResult = format.parse(varString, pos); if (dateResult != null && pos.getIndex() == varString.length()) { // OK, successfully parsed a value! isTime = true; selectedDateTimeFormat[i] = format; break; } } } if (!isTime) { isTimeVariable[i] = false; // if the token didn't parse as a time value, // we will still try to parse it as a date, below. // unless this column is NOT a date. } else { // And if it is a time value, we are going to assume it's // NOT a date. isDateVariable[i] = false; } } } if (isDateVariable[i]) { if (varString != null && !varString.isEmpty()) { boolean isDate = false; // TODO: // Strictly speaking, we should be doing the same thing // here as with the time formats above; select the // first one that works, then insist that all the // other values in this column match it... but we // only have one, as of now, so it should be ok. // -- L.A. 4.0 beta for (SimpleDateFormat format : DATE_FORMATS) { // Strict parsing - it will throw an // exception if it doesn't parse! 
format.setLenient(false); try { format.parse(varString); isDate = true; selectedDateFormat[i] = format; break; } catch (ParseException ex) { //Do nothing } } isDateVariable[i] = isDate; } } } } csvFilePrinter.printRecord(record); } } dataTable.setCaseQuantity(parser.getRecordNumber()); parser.close(); csvReader.close(); // Re-type the variables that we've determined are numerics: for (i = 0; i < headers.size(); i++) { if (isNumericVariable[i]) { dataTable.getDataVariables().get(i).setTypeNumeric(); if (isIntegerVariable[i]) { dataTable.getDataVariables().get(i).setIntervalDiscrete(); } else { dataTable.getDataVariables().get(i).setIntervalContinuous(); } } else if (isDateVariable[i] && selectedDateFormat[i] != null) { // Dates are still Strings, i.e., they are "character" and "discrete"; // But we add special format values for them: dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern()); dataTable.getDataVariables().get(i).setFormatCategory("date"); } else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) { // Same for time values: dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern()); dataTable.getDataVariables().get(i).setFormatCategory("time"); } } // Second, final pass. 
try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) { parser = new CSVParser(secondPassReader, inFormat.withHeader()); String[] caseRow = new String[headers.size()]; for (CSVRecord record : parser) { if (!record.isConsistent()) { List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() }); throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args)); } for (i = 0; i < headers.size(); i++) { String varString = record.get(i); if (isNumericVariable[i]) { if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) { // Missing value - represented as an empty string in // the final tab file caseRow[i] = ""; } else if (varString.equalsIgnoreCase("NaN")) { // "Not a Number" special value: caseRow[i] = "NaN"; } else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) { // Positive infinity: caseRow[i] = "Inf"; } else if (varString.equalsIgnoreCase("-Inf")) { // Negative infinity: caseRow[i] = "-Inf"; } else if (varString.equalsIgnoreCase("null")) { // By request from Gus - "NULL" is recognized as a // numeric zero: caseRow[i] = isIntegerVariable[i] ? "0" : "0.0"; } else { /* No re-formatting is done on any other numeric values. * We'll save them as they were, for archival purposes. * The alternative solution - formatting in sci. notation * is commented-out below. */ caseRow[i] = varString; /* if (isIntegerVariable[i]) { try { Integer testIntegerValue = new Integer(varString); caseRow[i] = testIntegerValue.toString(); } catch (NumberFormatException ex) { throw new IOException("Failed to parse a value recognized as an integer in the first pass! 
(?)"); } } else { try { Double testDoubleValue = new Double(varString); if (testDoubleValue.equals(0.0)) { caseRow[i] = "0.0"; } else { // One possible implementation: // // Round our fractional values to 15 digits // (minimum number of digits of precision guaranteed by // type Double) and format the resulting representations // in a IEEE 754-like "scientific notation" - for ex., // 753.24 will be encoded as 7.5324e2 BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext); caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal); // Strip meaningless zeros and extra + signs: caseRow[i] = caseRow[i].replaceFirst("00*e", "e"); caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e"); caseRow[i] = caseRow[i].replaceFirst("e\\+00", ""); caseRow[i] = caseRow[i].replaceFirst("^\\+", ""); } } catch (NumberFormatException ex) { throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)"); } } */ } } else if (isTimeVariable[i] || isDateVariable[i]) { // Time and Dates are stored NOT quoted (don't ask). if (varString != null) { // Dealing with quotes: // remove the leading and trailing quotes, if present: varString = varString.replaceFirst("^\"*", ""); varString = varString.replaceFirst("\"*$", ""); caseRow[i] = varString; } else { caseRow[i] = ""; } } else { // Treat as a String: // Strings are stored in tab files quoted; // Missing values are stored as an empty string // between two tabs (or one tab and the new line); // Empty strings stored as "" (quoted empty string). 
// For the purposes of this CSV ingest reader, we are going // to assume that all the empty strings in the file are // indeed empty strings, and NOT missing values: if (varString != null) { // escape the quotes, newlines, and tabs: varString = varString.replace("\"", "\\\""); varString = varString.replace("\n", "\\n"); varString = varString.replace("\t", "\\t"); // final pair of quotes: varString = "\"" + varString + "\""; caseRow[i] = varString; } else { caseRow[i] = "\"\""; } } } finalOut.println(StringUtils.join(caseRow, "\t")); } } long linecount = parser.getRecordNumber(); finalOut.close(); parser.close(); dbglog.fine("Tmp File: " + firstPassTempFile); // Firstpass file is deleted to prevent tmp from filling up. firstPassTempFile.delete(); if (dataTable.getCaseQuantity().intValue() != linecount) { List<String> args = Arrays .asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount }); throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args)); } return (int) linecount; }
From source file:norbert.mynemo.dataimport.scraping.CkMapping.java
/**
 * Returns <code>true</code> if the mapping can be created from the given record. Returns
 * <code>false</code> otherwise, including when the record is <code>null</code>.
 *
 * <p>A usable record is consistent and has both the {@code CK_MOVIE_HEADER} and the
 * {@code IMDB_MOVIE_HEADER} columns mapped.
 *
 * @param record the record to check, may be <code>null</code>
 * @return whether a mapping can be built from {@code record}
 */
public static boolean isValid(CSVRecord record) {
    // Null guard: report "not valid" instead of failing with a NullPointerException.
    return record != null && record.isConsistent() && record.isMapped(CK_MOVIE_HEADER)
            && record.isMapped(IMDB_MOVIE_HEADER);
}
From source file:norbert.mynemo.dataimport.scraping.CkRating.java
/**
 * Tells whether a rating can be created from the given record.
 *
 * <p>The record must be non-null, consistent, and have the {@code USER_HEADER},
 * {@code MOVIE_HEADER} and {@code VALUE_HEADER} columns mapped.
 *
 * @param record the record to check, may be <code>null</code>
 * @return <code>true</code> if all of the above holds, <code>false</code> otherwise
 */
public static boolean isValid(CSVRecord record) {
    if (record == null || !record.isConsistent()) {
        return false;
    }
    if (!record.isMapped(USER_HEADER) || !record.isMapped(MOVIE_HEADER)) {
        return false;
    }
    return record.isMapped(VALUE_HEADER);
}
From source file:org.apache.phoenix.mapreduce.CsvToKeyValueMapperTest.java
/**
 * Parses a plain two-field line with ';' as delimiter and checks the fields, the consistency
 * flag and the record number.
 */
@Test
public void testCsvLineParser() throws IOException {
    final char delimiter = ';';
    final char quote = '"';
    final char escape = '\\';
    CsvToKeyValueMapper.CsvLineParser parser =
            new CsvToKeyValueMapper.CsvLineParser(delimiter, quote, escape);

    CSVRecord result = parser.parse("one;two");

    assertEquals("one", result.get(0));
    assertEquals("two", result.get(1));
    assertTrue(result.isConsistent());
    assertEquals(1, result.getRecordNumber());
}
From source file:org.apache.phoenix.mapreduce.CsvToKeyValueMapperTest.java
/**
 * Parses a line whose two fields are quoted and contain an escaped quote, an escaped delimiter
 * and an escaped escape character, and checks the unescaped field values.
 */
@Test
public void testCsvLineParserWithQuoting() throws IOException {
    final char delimiter = ';';
    final char quote = '"';
    final char escape = '\\';
    CsvToKeyValueMapper.CsvLineParser parser =
            new CsvToKeyValueMapper.CsvLineParser(delimiter, quote, escape);

    // Raw input line: "\"one";"\;two\\"
    CSVRecord result = parser.parse("\"\\\"one\";\"\\;two\\\\\"");

    assertEquals("\"one", result.get(0));
    assertEquals(";two\\", result.get(1));
    assertTrue(result.isConsistent());
    assertEquals(1, result.getRecordNumber());
}
From source file:org.nuxeo.ecm.directory.DirectoryCSVLoader.java
/** * Loads the CSV data file based on the provided schema, and creates the corresponding entries using the provided * loader./*from www . j av a2 s. co m*/ * * @param dataFileName the file name containing CSV data * @param delimiter the CSV column separator * @param schema the data schema * @param loader the actual consumer of loaded rows * @since 8.4 */ public static void loadData(String dataFileName, char delimiter, Schema schema, Consumer<Map<String, Object>> loader) throws DirectoryException { try (InputStream in = getResource(dataFileName); // CSVParser csvParser = new CSVParser(new InputStreamReader(in, "UTF-8"), CSVFormat.DEFAULT.withDelimiter(delimiter).withHeader())) { Map<String, Integer> header = csvParser.getHeaderMap(); List<Field> fields = new ArrayList<>(); for (String columnName : header.keySet()) { Field field = schema.getField(columnName.trim()); if (field == null) { throw new DirectoryException( "Column not found: " + columnName + " in schema: " + schema.getName()); } fields.add(field); } int lineno = 1; // header was first line for (CSVRecord record : csvParser) { lineno++; if (record.size() == 0 || record.size() == 1 && StringUtils.isBlank(record.get(0))) { // NXP-2538: allow columns with only one value but skip empty lines continue; } if (!record.isConsistent()) { log.error("Invalid column count while reading CSV file: " + dataFileName + ", line: " + lineno + ", values: " + record); continue; } Map<String, Object> map = new HashMap<String, Object>(); for (int i = 0; i < header.size(); i++) { Field field = fields.get(i); String value = record.get(i); Object v = CSV_NULL_MARKER.equals(value) ? null : decode(field, value); map.put(field.getName().getPrefixedName(), v); } loader.accept(map); } } catch (IOException e) { throw new DirectoryException("Read error while reading data file: " + dataFileName, e); } }
From source file:org.openlmis.fulfillment.Resource2Db.java
Pair<List<String>, List<Object[]>> resourceCsvToBatchedPair(final Resource resource) throws IOException { XLOGGER.entry(resource.getDescription()); // parse CSV// ww w . j a v a 2 s. c o m try (InputStreamReader isReader = new InputStreamReader( new BOMInputStream(resource.getInputStream(), ByteOrderMark.UTF_8))) { CSVParser parser = CSVFormat.DEFAULT.withHeader().withNullString("").parse(isReader); // read header row MutablePair<List<String>, List<Object[]>> readData = new MutablePair<>(); readData.setLeft(new ArrayList<>(parser.getHeaderMap().keySet())); XLOGGER.info("Read header: " + readData.getLeft()); // read data rows List<Object[]> rows = new ArrayList<>(); for (CSVRecord record : parser.getRecords()) { if (!record.isConsistent()) { throw new IllegalArgumentException("CSV record inconsistent: " + record); } List theRow = IteratorUtils.toList(record.iterator()); rows.add(theRow.toArray()); } readData.setRight(rows); XLOGGER.exit("Records read: " + readData.getRight().size()); return readData; } }