List of usage examples for org.apache.commons.csv CSVRecord iterator
@Override
public Iterator<String> iterator()
From source file:com.stratio.decision.executables.DataFlowFromCsvMain.java
public static void main(String[] args) throws IOException, NumberFormatException, InterruptedException { if (args.length < 4) { log.info(// w w w.j av a 2s .c om "Usage: \n param 1 - path to file \n param 2 - stream name to send the data \n param 3 - time in ms to wait to send each data \n param 4 - broker list"); } else { Producer<String, String> producer = new Producer<String, String>(createProducerConfig(args[3])); Gson gson = new Gson(); Reader in = new FileReader(args[0]); CSVParser parser = CSVFormat.DEFAULT.parse(in); List<String> columnNames = new ArrayList<>(); for (CSVRecord csvRecord : parser.getRecords()) { if (columnNames.size() == 0) { Iterator<String> iterator = csvRecord.iterator(); while (iterator.hasNext()) { columnNames.add(iterator.next()); } } else { StratioStreamingMessage message = new StratioStreamingMessage(); message.setOperation(STREAM_OPERATIONS.MANIPULATION.INSERT.toLowerCase()); message.setStreamName(args[1]); message.setTimestamp(System.currentTimeMillis()); message.setSession_id(String.valueOf(System.currentTimeMillis())); message.setRequest_id(String.valueOf(System.currentTimeMillis())); message.setRequest("dummy request"); List<ColumnNameTypeValue> sensorData = new ArrayList<>(); for (int i = 0; i < columnNames.size(); i++) { // Workaround Object value = null; try { value = Double.valueOf(csvRecord.get(i)); } catch (NumberFormatException e) { value = csvRecord.get(i); } sensorData.add(new ColumnNameTypeValue(columnNames.get(i), null, value)); } message.setColumns(sensorData); String json = gson.toJson(message); log.info("Sending data: {}", json); producer.send(new KeyedMessage<String, String>(InternalTopic.TOPIC_DATA.getTopicName(), STREAM_OPERATIONS.MANIPULATION.INSERT, json)); log.info("Sleeping {} ms...", args[2]); Thread.sleep(Long.valueOf(args[2])); } } log.info("Program completed."); } }
From source file:edu.caltech.ipac.firefly.server.util.DsvToDataGroup.java
public static DataGroup parse(File inf, CSVFormat format) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(inf), IpacTableUtil.FILE_IO_BUFFER_SIZE); List<DataType> columns = new ArrayList<DataType>(); CSVParser parser = new CSVParser(reader, format); List<CSVRecord> records = parser.getRecords(); if (records != null && records.size() > 0) { // parse the column info CSVRecord cols = records.get(0); for (Iterator<String> itr = cols.iterator(); itr.hasNext();) { String s = itr.next(); if (!StringUtils.isEmpty(s)) { columns.add(new DataType(s, null)); // unknown type }/*from w w w . j av a 2s . c o m*/ } DataGroup dg = new DataGroup(null, columns); // parse the data for (int i = 1; i < records.size(); i++) { DataObject row = parseRow(dg, records.get(i)); if (row != null) { dg.add(row); } } dg.shrinkToFitData(); return dg; } return null; }
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCategoryAttributeCounts.java
private static void processFile(File csvData, String category) throws IOException { BufferedReader in = new BufferedReader( new InputStreamReader(new GZIPInputStream(new FileInputStream(csvData)))); //CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180); CSVParser parser = CSVFormat.EXCEL.parse(in); int r = 0;//from w ww .j ava2 s.c o m ArrayList<Integer> attributePositions = new ArrayList<>(); ArrayList<String> attributeNames = new ArrayList<>(); HashMap<String, Integer> thisCategoryAttributeCounts = new HashMap<>(); categoryAttributeCount.put(category, thisCategoryAttributeCounts); for (CSVRecord csvRecord : parser) { if (r == 0) { Iterator<String> it = csvRecord.iterator(); it.next(); //skip URI if (!it.hasNext()) { //it is an empty file return; } it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; for (; it.hasNext();) { c++; String attr = it.next(); if (!attr.endsWith("_label")) { attributePositions.add(c); } } categories.add(category); } else if (r == 1) { Iterator<String> it = csvRecord.iterator(); it.next(); //skip uri it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; int i = 0; while (i < attributePositions.size()) { c++; String attr = it.next(); if (attributePositions.get(i) == c) { if (!stopAttributes.contains(attr)) { attributes.add(attr); } attributeNames.add(attr); i++; } } } else if (r > 3) { Iterator<String> it = csvRecord.iterator(); String uri = it.next(); /*if (entities.contains(uri)) { System.out.println(uri + " already processed"); continue; }*/ entities.add(uri); it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; int i = 0; while (i < attributePositions.size()) { c++; String val = it.next(); if (attributePositions.get(i) == c) { if (!val.equalsIgnoreCase("null")) { String attribute = attributeNames.get(i); if (!stopAttributes.contains(attribute)) { Integer ac = attributeCount.get(attribute); if (ac == null) { 
attributeCount.put(attribute, 1); } else { attributeCount.put(attribute, ac + 1); } Integer tcac = thisCategoryAttributeCounts.get(attribute); if (tcac == null) { thisCategoryAttributeCounts.put(attribute, 1); } else { thisCategoryAttributeCounts.put(attribute, tcac + 1); } HashMap<String, Integer> thisAttributeCategoryCounts = attributeCategoryCount .get(attribute); if (thisAttributeCategoryCounts == null) { thisAttributeCategoryCounts = new HashMap<>(); attributeCategoryCount.put(attribute, thisAttributeCategoryCounts); } Integer tacc = thisAttributeCategoryCounts.get(category); if (tacc == null) { thisAttributeCategoryCounts.put(category, 1); } else { thisAttributeCategoryCounts.put(category, tacc + 1); } } } i++; } } } r++; } categoryCount.put(category, r - 3); }
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.Test.java
private static ArrayList<HashSet<String>> extractEntities(File csvData, int nOfAttributes) throws IOException { CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180); int r = 0;/*from www. jav a 2 s . co m*/ ArrayList<Integer> attributePositions = new ArrayList<>(); ArrayList<String> attributeNames = new ArrayList<>(); ArrayList<HashSet<String>> res = new ArrayList<>(); for (CSVRecord csvRecord : parser) { if (r == 0) { Iterator<String> it = csvRecord.iterator(); it.next(); //skip URI if (!it.hasNext()) { //it is an empty file return res; } it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; for (; it.hasNext();) { c++; String attr = it.next(); if (!attr.endsWith("_label")) { attributePositions.add(c); } } } else if (r == 1) { Iterator<String> it = csvRecord.iterator(); it.next(); //skip uri it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; int i = 0; while (i < attributePositions.size()) { c++; String attr = it.next(); if (attributePositions.get(i) == c) { if (!stopAttributes.contains(attr)) { attributes.add(attr); } attributeNames.add(attr); i++; } } } else if (r > 3) { ArrayList<String> attributesOfThisEntity = new ArrayList<>(); Iterator<String> it = csvRecord.iterator(); String uri = it.next(); it.next(); //skip rdf-schema#label it.next(); //skip rdf-schema#comment int c = 2; int i = 0; while (i < attributePositions.size()) { c++; String val = it.next(); if (attributePositions.get(i) == c) { if (!val.equalsIgnoreCase("null")) { String attribute = attributeNames.get(i); if (!stopAttributes.contains(attribute)) { attributesOfThisEntity.add(attribute); } } i++; } } Collections.shuffle(attributesOfThisEntity); HashSet<String> s = new HashSet<>(); for (int k = 0; k < Math.min(nOfAttributes, attributesOfThisEntity.size()); k++) { s.add(attributesOfThisEntity.get(k)); } res.add(s); } r++; } return res; }
From source file:cma.fa.tc.impl.utils.files.SimpleCsvReader.java
/**
 * Reads the classpath resource at {@code path} as Excel-style CSV.
 *
 * <p>Any failure (including a missing resource) is logged and swallowed; the rows read before
 * the failure are still returned.
 *
 * @return one list of field values per record, in file order; empty if nothing could be read
 */
@Override
public List<List<String>> read() {
    List<List<String>> result = new ArrayList<>();
    try (BufferedReader in = new BufferedReader(
            new InputStreamReader(this.getClass().getResourceAsStream(this.path), this.charset))) {
        for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
            result.add(Lists.newArrayList(record.iterator()));
        }
    } catch (Exception ex) {
        // Pass the path for the "{}" placeholder; with the exception as the only argument the
        // placeholder was never substituted and the log did not say which resource failed.
        log.error("Cannot read {}", this.path, ex);
    }
    return result;
}
From source file:com.willwinder.universalgcodesender.utils.GrblLookups.java
public GrblLookups(String prefix) { String filename = prefix + "_" + Localization.loadedLocale() + ".csv"; URL u = GrblLookups.class.getResource(pathFor(filename)); if (u == null) { filename = prefix + "_en_US.csv"; }//from ww w . j ava 2 s. co m try { try (BufferedReader reader = new BufferedReader( new InputStreamReader(GrblLookups.class.getResourceAsStream(pathFor(filename))))) { Iterable<CSVRecord> records = CSVFormat.RFC4180.withFirstRecordAsHeader().parse(reader); for (CSVRecord record : records) { List<String> list = Lists.newArrayList(record.iterator()); lookups.put(record.get(0), list.toArray(new String[0])); } } } catch (IOException ex) { System.out.println("Unable to load GRBL resources."); ex.printStackTrace(); } }
From source file:citation_prediction.CitationCore.java
/** * Fix the citation data, which is in years by translating the timestamps and citations to be in days. * /*from w ww.j a v a 2s . co m*/ * @param record The citation history in years. * @param limitToRows Limit the rows being processed. * @return The citation history in days. */ private static double[][] fixData(CSVRecord record, int limitToRows) { double[][] r = null; int citationCount = 0; int numberOfRowsToProcess = 0; Iterator<String> record_iterator = record.iterator(); record_iterator.next(); //move pass paper id record_iterator.next(); //move pass paper publish year if (limitToRows != 0) { numberOfRowsToProcess = limitToRows; r = new double[numberOfRowsToProcess + 1][2]; } else { numberOfRowsToProcess = record.size() - 2; r = new double[record.size() - 2][2]; } for (int rowIndex = 0; record_iterator.hasNext() && rowIndex < numberOfRowsToProcess; rowIndex++) { String citations_forthis_year = record_iterator.next(); r[rowIndex][0] = Double.valueOf(rowIndex); //timestamp r[rowIndex][1] = Double.valueOf(citations_forthis_year); //citation citationCount += r[rowIndex][1]; } return fixData(r, citationCount); }
From source file:biz.ganttproject.impex.csv.RecordGroup.java
boolean isHeader(CSVRecord record) { Set<String> thoseFields = Sets.newHashSet(); for (Iterator<String> it = record.iterator(); it.hasNext();) { thoseFields.add(it.next());//from ww w . j av a2 s. c o m } return thoseFields.containsAll(myMandatoryFields); }
From source file:biz.ganttproject.impex.csv.RecordGroup.java
/**
 * Processes a single record, ignoring records whose values are all null or empty.
 *
 * @param record the record to process; asserted to have at least one value
 * @return the result of {@code doProcess(record)}, or {@code false} if the record has no
 *     content or processing threw
 */
boolean process(CSVRecord record) {
    assert record.size() > 0;

    boolean hasContent = false;
    for (String value : record) {
        if (!Strings.isNullOrEmpty(value)) {
            hasContent = true;
            break;
        }
    }
    if (!hasContent) {
        return false;
    }

    try {
        return doProcess(record);
    } catch (Throwable e) {
        // Any failure is logged as a warning and reported as "not processed".
        GPLogger.getLogger(GanttCSVOpen.class).log(Level.WARNING,
                String.format("Failed to process record:\n%s", record), e);
        return false;
    }
}
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
public List<InputSplit> getSplits(JobContext job) throws IOException { boolean delimSplit = isSplitInput(job.getConfiguration()); //if delimSplit is true, size of each split is determined by //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat List<InputSplit> splits = super.getSplits(job); if (!delimSplit) { return splits; }// ww w .j av a 2 s. c o m if (splits.size() >= SPLIT_COUNT_LIMIT) { //if #splits > 1 million, there is enough parallelism //therefore no point to split LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT); DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT); return splits; } // add header info into splits List<InputSplit> populatedSplits = new ArrayList<InputSplit>(); LOG.info(splits.size() + " DelimitedSplits generated"); Configuration conf = job.getConfiguration(); char delimiter = 0; ArrayList<Text> hlist = new ArrayList<Text>(); for (InputSplit file : splits) { FileSplit fsplit = ((FileSplit) file); Path path = fsplit.getPath(); FileSystem fs = path.getFileSystem(conf); if (fsplit.getStart() == 0) { // parse the inSplit, get the header FSDataInputStream fileIn = fs.open(path); String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER); if (delimStr.length() == 1) { delimiter = delimStr.charAt(0); } else { LOG.error("Incorrect delimitor: " + delimiter + ". 
Expects single character."); } String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING, MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING); InputStreamReader instream = new InputStreamReader(fileIn, encoding); CSVParser parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true)); Iterator<CSVRecord> it = parser.iterator(); String[] header = null; if (it.hasNext()) { CSVRecord record = (CSVRecord) it.next(); Iterator<String> recordIterator = record.iterator(); int recordSize = record.size(); header = new String[recordSize]; for (int i = 0; i < recordSize; i++) { if (recordIterator.hasNext()) { header[i] = (String) recordIterator.next(); } else { throw new IOException("Record size doesn't match the real size"); } } EncodingUtil.handleBOMUTF8(header, 0); hlist.clear(); for (String s : header) { hlist.add(new Text(s)); } } instream.close(); } DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])), path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations()); populatedSplits.add(ds); } return populatedSplits; }