Example usage for org.apache.commons.csv CSVRecord isConsistent

List of usage examples for org.apache.commons.csv CSVRecord isConsistent

Introduction

On this page you can find example usages of org.apache.commons.csv.CSVRecord.isConsistent().

Prototype

public boolean isConsistent() 

Source Link

Document

Tells whether the record size matches the header size.

Usage

From source file:ai.grakn.migration.csv.CSVMigrator.java

/**
 * Turns one CSV record into the {@code Map<String, Object>} form that is the current input format
 * for graql templating.
 *
 * @param data a single data record from the CSV reader (any row but the first row of the input file)
 * @return the record's map, keeping only the entries whose value passes {@code validValue}
 * @throws RuntimeException if the record is not consistent
 */
private Map<String, Object> parse(CSVRecord data) {
    final boolean consistent = data.isConsistent();
    if (consistent) {
        final Map<String, String> columns = data.toMap();
        return columns.entrySet().stream()
                .filter(column -> validValue(column.getValue()))
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
    // Inconsistent record: report its content in the failure message.
    throw new RuntimeException("Invalid CSV " + data.toMap());
}

From source file:com.datascience.hadoop.CsvRecordReader.java

/**
 * Reads the next CSV record into the supplied key/value holders.
 * <p>
 * The key is set to the record number and the value list is filled with one entry per column
 * ({@code null} for a {@code null} column, otherwise a {@code Text} instance taken from, or added
 * to, {@code cache}). The expected column count is taken from the first record read.
 * <p>
 * In strict mode an inconsistent record, or one whose size differs from the expected column count,
 * raises a {@code CsvParseException}, and any failure is rethrown. In lenient mode a record that
 * fails is logged and skipped, and reading continues with the following record.
 *
 * @param key receives the record number
 * @param value cleared, then filled with the column values of the record
 * @return {@code true} if a record was read, {@code false} if the iterator is exhausted
 * @throws IOException declared by the reader contract
 */
@Override
public boolean next(LongWritable key, ListWritable<Text> value) throws IOException {
    // Iterate rather than recurse when skipping a bad record: the previous recursive retry added
    // one stack frame per skipped record and could overflow the stack on a long run of bad input.
    // NOTE(review): if the underlying iterator fails on every call, lenient mode keeps retrying
    // here (the recursive version ended in a StackOverflowError instead) -- confirm acceptable.
    while (true) {
        value.clear();
        try {
            if (!iterator.hasNext()) {
                return false;
            }

            CSVRecord record = iterator.next();
            position++;
            colLength = colLength == null ? record.size() : colLength;
            if ((!record.isConsistent() || record.size() != colLength) && strict) {
                String message = String.format("%s: %s", "inconsistent record at position", position);
                throw new CsvParseException(message);
            }

            key.set(record.getRecordNumber());

            for (int i = 0; i < record.size(); i++) {
                String item = record.get(i);
                if (item == null) {
                    value.add(null);
                } else {
                    // Reuse one Text instance per column index instead of allocating per record.
                    Text text = cache[i];
                    if (text == null) {
                        text = new Text();
                        cache[i] = text;
                    }
                    text.set(item);
                    value.add(text);
                }
            }
            return true;
        } catch (Exception e) {
            // Pass the exception to the logger so the cause is not lost.
            LOGGER.warn("failed to parse record at position: " + position, e);
            if (strict) {
                throw e;
            }
            // Lenient mode: fall through and try the next record.
        }
    }
}

From source file:com.xceptance.xlt.common.tests.AbstractURLTestCase.java

/**
 * Loads the login, the password and the CSV-defined URL actions. Annotated with JUnit's
 * {@code @Before}, so it runs ahead of the test method.
 * <p>
 * The data file is parsed with a header row, {@code #} comments, ignored empty lines and ignored
 * surrounding spaces. Every header field must be permitted by
 * {@code CSVBasedURLAction.isPermittedHeaderField}. Consistent records that contain a URL are
 * turned into {@code CSVBasedURLAction}s; records without a URL are logged and skipped.
 * Inconsistent records are logged and make the method fail once the whole file has been read.
 *
 * @throws IOException if the data file cannot be opened or read
 * @throws RuntimeException if a line cannot be parsed or converted into an action, or if any
 *         inconsistent line was found
 */
@Before
public void loadData() throws IOException {
    login = getProperty("login", getProperty("com.xceptance.xlt.auth.userName"));
    password = getProperty("password", getProperty("com.xceptance.xlt.auth.password"));

    // load the data. Ideally we would offload the file searching to
    // XltProperties.getDataFile(String name)
    // or XltProperties.getDataFile(String name, String locale)
    // or XltProperties.getDataFile(String name, Locale locale)
    final String dataDirectory = XltProperties.getInstance().getProperty(
            XltConstants.XLT_PACKAGE_PATH + ".data.directory", "config" + File.separatorChar + "data");
    final File file = new File(dataDirectory,
            getProperty("filename", Session.getCurrent().getUserName() + ".csv"));

    BufferedReader br = null;
    boolean incorrectLines = false;

    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));

        // permit # as comment, empty lines, set comma as separator, and activate the header
        final CSVFormat csvFormat = CSVFormat.RFC4180.toBuilder().withIgnoreEmptyLines(true)
                .withCommentStart('#').withHeader().withIgnoreSurroundingSpaces(true).build();
        final CSVParser parser = new CSVParser(br, csvFormat);
        final Iterator<CSVRecord> csvRecords = parser.iterator();

        // verify header fields to avoid problems with incorrect spelling or spaces
        final Map<String, Integer> headerMap = parser.getHeaderMap();

        for (final String headerField : headerMap.keySet()) {
            if (!CSVBasedURLAction.isPermittedHeaderField(headerField)) {
                Assert.fail(MessageFormat.format("Unsupported or misspelled header field: {0}", headerField));
            }
        }

        // go over all lines, this is a little odd, because we have to catch the iterator exception
        while (true) {
            try {
                final boolean hasNext = csvRecords.hasNext();
                if (!hasNext) {
                    break;
                }
            } catch (final Exception e) {
                // the plus 1 is meant to correct the increment missing because of the exception;
                // the original exception is chained so its stack trace is not lost
                throw new RuntimeException(
                        MessageFormat.format("Line at {0} is invalid, because of <{1}>. Line is ignored.",
                                parser.getLineNumber() + 1, e.getMessage()),
                        e);
            }

            final CSVRecord csvRecord = csvRecords.next();

            // only take ok lines
            if (csvRecord.isConsistent()) {
                // guard against data exceptions
                try {
                    // do we have an url?
                    if (csvRecord.get(CSVBasedURLAction.URL) != null) {
                        // take it
                        csvBasedActions.add(new CSVBasedURLAction(csvRecord, interpreter));
                    } else {
                        XltLogger.runTimeLogger.error(MessageFormat.format(
                                "Line at {0} does not contain any URL. Line is ignored: {1}",
                                parser.getLineNumber(), csvRecord));
                    }
                } catch (final Exception e) {
                    // chain the cause instead of keeping only its message
                    throw new RuntimeException(MessageFormat.format(
                            "Line at {0} is invalid, because of <{2}>. Line is ignored: {1}",
                            parser.getLineNumber(), csvRecord, e.getMessage()), e);
                }
            } else {
                XltLogger.runTimeLogger.error(MessageFormat.format(
                        "Line at {0} has not been correctly formatted. Line is ignored: {1}",
                        parser.getLineNumber(), csvRecord));
                incorrectLines = true;
            }
        }
    } finally {
        IOUtils.closeQuietly(br);
    }

    // stop if we have anything that is incorrect, avoid half running test cases
    if (incorrectLines) {
        throw new RuntimeException("Found incorrectly formatted lines. Stopping here.");
    }
}

From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv.CSVFileReader.java

/**
 * Ingests a CSV stream with a header row in two passes and writes it out as tab-delimited text.
 * <p>
 * First pass: one {@code DataVariable} is created per header column, every record is examined to
 * decide, per column, whether it is numeric, integer, time or date, and the records are copied to
 * a temporary file. Second pass: the temporary file is re-read and each record is written to
 * {@code finalOut} as one tab-separated line, formatted according to the column type determined
 * in the first pass.
 * <p>
 * Side effects visible here: the variable list, variable quantity and case quantity are set on
 * {@code dataTable}; {@code csvReader} and {@code finalOut} are closed on the success path.
 *
 * @param csvReader source of the CSV data, including the header row
 * @param dataTable receives the variables and the case (record) count
 * @param finalOut receives the tab-delimited output
 * @return the number of records read in the second pass, narrowed to {@code int}
 * @throws IOException if a header name is null or empty, if a record is not consistent with the
 *         header, if the record counts of the two passes differ, or on an I/O failure
 */
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException {

    List<DataVariable> variableList = new ArrayList<>();
    CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
    Map<String, Integer> headers = parser.getHeaderMap();

    // Create one variable per header column; i is the column's position in the file.
    int i = 0;
    for (String varName : headers.keySet()) {
        if (varName == null || varName.isEmpty()) {
            // TODO:
            // Add a sensible variable name validation algorithm.
            // -- L.A. 4.0 alpha 1
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
        }

        DataVariable dv = new DataVariable();
        dv.setName(varName);
        dv.setLabel(varName);
        dv.setInvalidRanges(new ArrayList<>());
        dv.setSummaryStatistics(new ArrayList<>());
        dv.setUnf("UNF:6:NOTCALCULATED");
        dv.setCategories(new ArrayList<>());
        variableList.add(dv);

        // Every variable starts out as character/discrete; it is re-typed after the first pass.
        dv.setTypeCharacter();
        dv.setIntervalDiscrete();
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
        i++;
    }

    dataTable.setVarQuantity((long) variableList.size());
    dataTable.setDataVariables(variableList);

    // Per-column type flags, indexed by column position.
    boolean[] isNumericVariable = new boolean[headers.size()];
    boolean[] isIntegerVariable = new boolean[headers.size()];
    boolean[] isTimeVariable = new boolean[headers.size()];
    boolean[] isDateVariable = new boolean[headers.size()];

    for (i = 0; i < headers.size(); i++) {
        // OK, let's assume that every variable is numeric;
        // but we'll go through the file and examine every value; the
        // moment we find a value that's not a legit numeric one, we'll
        // assume that it is in fact a String.
        isNumericVariable[i] = true;
        isIntegerVariable[i] = true;
        isDateVariable[i] = true;
        isTimeVariable[i] = true;
    }

    // First, "learning" pass.
    // (we'll save the incoming stream in another temp file:)
    SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
    SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];

    // NOTE(review): the temp file is only deleted near the end of this method; any exception
    // thrown before that point (e.g. the record-mismatch IOExceptions below) leaves it behind.
    File firstPassTempFile = File.createTempFile("firstpass-", ".csv");

    // NOTE(review): FileWriter here and FileReader in the second pass are created without an
    // explicit charset, so the temp file round-trips through the platform default -- confirm
    // this is safe for non-ASCII data.
    try (CSVPrinter csvFilePrinter = new CSVPrinter(
            // TODO allow other parsers of tabular data to use this parser by changing inFormat
            new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
        // Write headers
        csvFilePrinter.printRecord(headers.keySet());
        for (CSVRecord record : parser.getRecords()) {
            // Checks if #records = #columns in header
            if (!record.isConsistent()) {
                List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1),
                        "" + headers.size(), "" + record.size() });
                throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
            }

            for (i = 0; i < headers.size(); i++) {
                String varString = record.get(i);
                // The column stays "integer" only while every value is non-null and is either
                // empty, the literal "null", or a character from firstNumCharSet followed by
                // a string that StringUtils.isNumeric accepts.
                isIntegerVariable[i] = isIntegerVariable[i] && varString != null
                        && (varString.isEmpty() || varString.equals("null")
                                || (firstNumCharSet.contains(varString.charAt(0))
                                        && StringUtils.isNumeric(varString.substring(1))));
                if (isNumericVariable[i]) {
                    // If variable might be "numeric" test to see if this value is a parsable number:
                    if (varString != null && !varString.isEmpty()) {

                        // NOTE(review): isNumeric and isInteger are never read in this block.
                        boolean isNumeric = false;
                        boolean isInteger = false;

                        if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA")
                                || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")
                                || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) {
                            // Special token accepted as numeric: move on to the next column.
                            continue;
                        } else {
                            try {
                                // Only the parse attempt matters; the parsed value is not used.
                                Double testDoubleValue = new Double(varString);
                                continue;
                            } catch (NumberFormatException ex) {
                                // the token failed to parse as a double
                                // so the column is a string variable.
                            }
                        }
                        isNumericVariable[i] = false;
                    }
                }

                // If this is not a numeric column, see if it is a date column
                // by parsing the cell as a date or date-time value:
                if (!isNumericVariable[i]) {

                    Date dateResult = null;

                    if (isTimeVariable[i]) {
                        if (varString != null && !varString.isEmpty()) {
                            boolean isTime = false;

                            if (selectedDateTimeFormat[i] != null) {
                                // A format was already chosen for this column: the value must
                                // match it in full.
                                ParsePosition pos = new ParsePosition(0);
                                dateResult = selectedDateTimeFormat[i].parse(varString, pos);

                                if (dateResult != null && pos.getIndex() == varString.length()) {
                                    // OK, successfully parsed a value!
                                    isTime = true;
                                }
                            } else {
                                // No format chosen yet: take the first one that consumes the
                                // whole value.
                                for (SimpleDateFormat format : TIME_FORMATS) {
                                    ParsePosition pos = new ParsePosition(0);
                                    dateResult = format.parse(varString, pos);
                                    if (dateResult != null && pos.getIndex() == varString.length()) {
                                        // OK, successfully parsed a value!
                                        isTime = true;
                                        selectedDateTimeFormat[i] = format;
                                        break;
                                    }
                                }
                            }
                            if (!isTime) {
                                isTimeVariable[i] = false;
                                // if the token didn't parse as a time value,
                                // we will still try to parse it as a date, below.
                                // unless this column is NOT a date.
                            } else {
                                // And if it is a time value, we are going to assume it's
                                // NOT a date.
                                isDateVariable[i] = false;
                            }
                        }
                    }

                    if (isDateVariable[i]) {
                        if (varString != null && !varString.isEmpty()) {
                            boolean isDate = false;

                            // TODO:
                            // Strictly speaking, we should be doing the same thing
                            // here as with the time formats above; select the
                            // first one that works, then insist that all the
                            // other values in this column match it... but we
                            // only have one, as of now, so it should be ok.
                            // -- L.A. 4.0 beta
                            for (SimpleDateFormat format : DATE_FORMATS) {
                                // Strict parsing - it will throw an
                                // exception if it doesn't parse!
                                format.setLenient(false);
                                try {
                                    format.parse(varString);
                                    isDate = true;
                                    selectedDateFormat[i] = format;
                                    break;
                                } catch (ParseException ex) {
                                    //Do nothing
                                }
                            }
                            isDateVariable[i] = isDate;
                        }
                    }
                }
            }

            // Copy the record to the temp file for the second pass.
            csvFilePrinter.printRecord(record);
        }
    }
    // The number of records seen in the first pass becomes the case quantity.
    dataTable.setCaseQuantity(parser.getRecordNumber());
    parser.close();
    csvReader.close();

    // Re-type the variables that we've determined are numerics:
    for (i = 0; i < headers.size(); i++) {
        if (isNumericVariable[i]) {
            dataTable.getDataVariables().get(i).setTypeNumeric();

            if (isIntegerVariable[i]) {
                dataTable.getDataVariables().get(i).setIntervalDiscrete();
            } else {
                dataTable.getDataVariables().get(i).setIntervalContinuous();
            }
        } else if (isDateVariable[i] && selectedDateFormat[i] != null) {
            // Dates are still Strings, i.e., they are "character" and "discrete";
            // But we add special format values for them:
            // NOTE(review): this uses DATE_FORMATS[0] rather than selectedDateFormat[i]; the two
            // only agree while DATE_FORMATS holds a single format.
            dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
            dataTable.getDataVariables().get(i).setFormatCategory("date");
        } else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
            // Same for time values:
            dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
            dataTable.getDataVariables().get(i).setFormatCategory("time");
        }
    }
    // Second, final pass.
    try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
        parser = new CSVParser(secondPassReader, inFormat.withHeader());
        // One output cell per column; the array is reused for every record.
        String[] caseRow = new String[headers.size()];

        for (CSVRecord record : parser) {
            if (!record.isConsistent()) {
                List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1),
                        "" + headers.size(), "" + record.size() });
                throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
            }

            for (i = 0; i < headers.size(); i++) {
                String varString = record.get(i);
                if (isNumericVariable[i]) {
                    if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
                        // Missing value - represented as an empty string in
                        // the final tab file
                        caseRow[i] = "";
                    } else if (varString.equalsIgnoreCase("NaN")) {
                        // "Not a Number" special value:
                        caseRow[i] = "NaN";
                    } else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) {
                        // Positive infinity:
                        caseRow[i] = "Inf";
                    } else if (varString.equalsIgnoreCase("-Inf")) {
                        // Negative infinity:
                        caseRow[i] = "-Inf";
                    } else if (varString.equalsIgnoreCase("null")) {
                        // By request from Gus - "NULL" is recognized as a
                        // numeric zero:
                        caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
                    } else {
                        /* No re-formatting is done on any other numeric values.
                         * We'll save them as they were, for archival purposes.
                         * The alternative solution - formatting in sci. notation
                         * is commented-out below.
                         */
                        caseRow[i] = varString;
                        /*
                         if (isIntegerVariable[i]) {
                        try {
                            Integer testIntegerValue = new Integer(varString);
                            caseRow[i] = testIntegerValue.toString();
                        } catch (NumberFormatException ex) {
                            throw new IOException("Failed to parse a value recognized as an integer in the first pass! (?)");
                        }
                        } else {
                        try {
                            Double testDoubleValue = new Double(varString);
                            if (testDoubleValue.equals(0.0)) {
                                caseRow[i] = "0.0";
                            } else {
                                                                    // One possible implementation:
                                //
                                // Round our fractional values to 15 digits
                                // (minimum number of digits of precision guaranteed by
                                // type Double) and format the resulting representations
                                // in a IEEE 754-like "scientific notation" - for ex.,
                                // 753.24 will be encoded as 7.5324e2
                                BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext);
                                caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal);
                                
                                // Strip meaningless zeros and extra + signs:
                                caseRow[i] = caseRow[i].replaceFirst("00*e", "e");
                                caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e");
                                caseRow[i] = caseRow[i].replaceFirst("e\\+00", "");
                                caseRow[i] = caseRow[i].replaceFirst("^\\+", "");
                            }
                        } catch (NumberFormatException ex) {
                            throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)");
                        }
                        }
                         */
                    }
                } else if (isTimeVariable[i] || isDateVariable[i]) {
                    // Time and Dates are stored NOT quoted (don't ask).
                    if (varString != null) {
                        // Dealing with quotes:
                        // remove the leading and trailing quotes, if present:
                        varString = varString.replaceFirst("^\"*", "");
                        varString = varString.replaceFirst("\"*$", "");
                        caseRow[i] = varString;
                    } else {
                        caseRow[i] = "";
                    }
                } else {
                    // Treat as a String:
                    // Strings are stored in tab files quoted;
                    // Missing values are stored as an empty string
                    // between two tabs (or one tab and the new line);
                    // Empty strings stored as "" (quoted empty string).
                    // For the purposes of this CSV ingest reader, we are going
                    // to assume that all the empty strings in the file are
                    // indeed empty strings, and NOT missing values:
                    if (varString != null) {
                        // escape the quotes, newlines, and tabs:
                        varString = varString.replace("\"", "\\\"");
                        varString = varString.replace("\n", "\\n");
                        varString = varString.replace("\t", "\\t");
                        // final pair of quotes:
                        varString = "\"" + varString + "\"";
                        caseRow[i] = varString;
                    } else {
                        caseRow[i] = "\"\"";
                    }
                }
            }
            // Emit the record as one tab-separated line.
            finalOut.println(StringUtils.join(caseRow, "\t"));
        }
    }
    // Record count of the second pass; compared with the first-pass count below.
    long linecount = parser.getRecordNumber();
    finalOut.close();
    parser.close();
    dbglog.fine("Tmp File: " + firstPassTempFile);
    // Firstpass file is deleted to prevent tmp from filling up.
    firstPassTempFile.delete();
    // Both passes must have seen the same number of records.
    if (dataTable.getCaseQuantity().intValue() != linecount) {
        List<String> args = Arrays
                .asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount });
        throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
    }
    return (int) linecount;
}

From source file:norbert.mynemo.dataimport.scraping.CkMapping.java

/**
 * Returns <code>true</code> if the mapping can be created from the given record, that is, if the
 * record is not <code>null</code>, is consistent, and has both the <code>CK_MOVIE_HEADER</code>
 * and <code>IMDB_MOVIE_HEADER</code> columns mapped. Returns <code>false</code> otherwise.
 *
 * @param record the CSV record to check, may be <code>null</code>
 * @return whether a mapping can be created from the record
 */
public static boolean isValid(CSVRecord record) {
    // A null record is reported as invalid instead of raising a NullPointerException, which
    // honors the "false otherwise" contract.
    return record != null && record.isConsistent() && record.isMapped(CK_MOVIE_HEADER)
            && record.isMapped(IMDB_MOVIE_HEADER);
}

From source file:norbert.mynemo.dataimport.scraping.CkRating.java

/**
 * Checks whether a rating can be created from the given record.
 *
 * @param record the CSV record to check, may be <code>null</code>
 * @return <code>true</code> if the record is not <code>null</code>, is consistent, and has the
 *         <code>USER_HEADER</code>, <code>MOVIE_HEADER</code> and <code>VALUE_HEADER</code>
 *         columns mapped; <code>false</code> otherwise
 */
public static boolean isValid(CSVRecord record) {
    if (record == null || !record.isConsistent()) {
        return false;
    }
    if (!record.isMapped(USER_HEADER)) {
        return false;
    }
    if (!record.isMapped(MOVIE_HEADER)) {
        return false;
    }
    return record.isMapped(VALUE_HEADER);
}

From source file:org.apache.phoenix.mapreduce.CsvToKeyValueMapperTest.java

/**
 * Parses a plain two-column line with ';' as the delimiter and checks both values, the
 * consistency flag and the record number.
 */
@Test
public void testCsvLineParser() throws IOException {
    final String line = "one;two";
    final CsvToKeyValueMapper.CsvLineParser parserUnderTest =
            new CsvToKeyValueMapper.CsvLineParser(';', '"', '\\');

    final CSVRecord result = parserUnderTest.parse(line);

    // column values
    assertEquals("one", result.get(0));
    assertEquals("two", result.get(1));
    // record metadata
    assertTrue(result.isConsistent());
    assertEquals(1, result.getRecordNumber());
}

From source file:org.apache.phoenix.mapreduce.CsvToKeyValueMapperTest.java

/**
 * Parses a line whose two columns are quoted and contain an escaped quote, an escaped delimiter
 * and an escaped escape character, then checks the unescaped values, the consistency flag and the
 * record number.
 */
@Test
public void testCsvLineParserWithQuoting() throws IOException {
    final String line = "\"\\\"one\";\"\\;two\\\\\"";
    final CsvToKeyValueMapper.CsvLineParser parserUnderTest =
            new CsvToKeyValueMapper.CsvLineParser(';', '"', '\\');

    final CSVRecord result = parserUnderTest.parse(line);

    // column values after quote and escape handling
    assertEquals("\"one", result.get(0));
    assertEquals(";two\\", result.get(1));
    // record metadata
    assertTrue(result.isConsistent());
    assertEquals(1, result.getRecordNumber());
}

From source file:org.nuxeo.ecm.directory.DirectoryCSVLoader.java

/**
 * Loads the CSV data file based on the provided schema, and creates the corresponding entries using the provided
 * loader./*from  www  .  j  av a2 s.  co  m*/
 *
 * @param dataFileName the file name containing CSV data
 * @param delimiter the CSV column separator
 * @param schema the data schema
 * @param loader the actual consumer of loaded rows
 * @since 8.4
 */
public static void loadData(String dataFileName, char delimiter, Schema schema,
        Consumer<Map<String, Object>> loader) throws DirectoryException {
    try (InputStream in = getResource(dataFileName); //
            CSVParser csvParser = new CSVParser(new InputStreamReader(in, "UTF-8"),
                    CSVFormat.DEFAULT.withDelimiter(delimiter).withHeader())) {
        Map<String, Integer> header = csvParser.getHeaderMap();

        List<Field> fields = new ArrayList<>();
        for (String columnName : header.keySet()) {
            Field field = schema.getField(columnName.trim());
            if (field == null) {
                throw new DirectoryException(
                        "Column not found: " + columnName + " in schema: " + schema.getName());
            }
            fields.add(field);
        }

        int lineno = 1; // header was first line
        for (CSVRecord record : csvParser) {
            lineno++;
            if (record.size() == 0 || record.size() == 1 && StringUtils.isBlank(record.get(0))) {
                // NXP-2538: allow columns with only one value but skip empty lines
                continue;
            }
            if (!record.isConsistent()) {
                log.error("Invalid column count while reading CSV file: " + dataFileName + ", line: " + lineno
                        + ", values: " + record);
                continue;
            }

            Map<String, Object> map = new HashMap<String, Object>();
            for (int i = 0; i < header.size(); i++) {
                Field field = fields.get(i);
                String value = record.get(i);
                Object v = CSV_NULL_MARKER.equals(value) ? null : decode(field, value);
                map.put(field.getName().getPrefixedName(), v);
            }
            loader.accept(map);
        }
    } catch (IOException e) {
        throw new DirectoryException("Read error while reading data file: " + dataFileName, e);
    }
}

From source file:org.openlmis.fulfillment.Resource2Db.java

/**
 * Parses a CSV resource that starts with a header row into a pair of column names and data rows.
 * <p>
 * The left side of the pair holds the header names; the right side holds one {@code Object[]} per
 * record with that record's values in column order. The format is configured with an empty null
 * string, so empty values are read as {@code null}.
 *
 * @param resource the CSV resource to read
 * @return the header names paired with the data rows
 * @throws IOException if the resource cannot be opened or read
 * @throws IllegalArgumentException if a record is not consistent with the header
 */
Pair<List<String>, List<Object[]>> resourceCsvToBatchedPair(final Resource resource) throws IOException {
    XLOGGER.entry(resource.getDescription());

    // parse CSV
    // Decode explicitly as UTF-8. The stream is wrapped for UTF-8 byte-order-mark handling, so the
    // content is expected to be UTF-8; without a charset the reader would fall back to the
    // platform default and could corrupt non-ASCII data.
    try (InputStreamReader isReader = new InputStreamReader(
            new BOMInputStream(resource.getInputStream(), ByteOrderMark.UTF_8), "UTF-8")) {
        CSVParser parser = CSVFormat.DEFAULT.withHeader().withNullString("").parse(isReader);

        // read header row
        MutablePair<List<String>, List<Object[]>> readData = new MutablePair<>();
        readData.setLeft(new ArrayList<>(parser.getHeaderMap().keySet()));
        XLOGGER.info("Read header: " + readData.getLeft());

        // read data rows
        List<Object[]> rows = new ArrayList<>();
        for (CSVRecord record : parser.getRecords()) {
            if (!record.isConsistent()) {
                throw new IllegalArgumentException("CSV record inconsistent: " + record);
            }

            // Copy the record's values, in column order, into an array for batching.
            rows.add(IteratorUtils.toList(record.iterator()).toArray());
        }
        readData.setRight(rows);

        XLOGGER.exit("Records read: " + readData.getRight().size());
        return readData;
    }
}