Java tutorial
/* * Copyright (c) 2013 Yolodata, LLC, All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.yolodata.tbana.hadoop.mapred.csv; import com.yolodata.tbana.hadoop.mapred.util.ArrayListTextWritable; import com.yolodata.tbana.hadoop.mapred.util.CSVReader; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import java.io.*; import java.util.List; import java.util.zip.ZipInputStream; public class CSVLineRecordReader implements RecordReader<LongWritable, List<Text>> { public static final String FORMAT_DELIMITER = "mapred.csvinput.delimiter"; public static final String FORMAT_SEPARATOR = "mapred.csvinput.separator"; public static final String IS_ZIPFILE = "mapred.csvinput.zipfile"; public static final String DEFAULT_DELIMITER = "\""; public static final String DEFAULT_SEPARATOR = ","; public static final boolean DEFAULT_ZIP = true; private boolean isZipFile; protected InputStream is; protected CSVReader reader; private CompressionCodecFactory compressionCodecs = null; private long start; private long pos; private long end; public CSVLineRecordReader() { } public CSVLineRecordReader(InputStream is, JobConf conf) throws IOException { init(is, conf); } public void init(InputStream is, JobConf conf) throws IOException { this.isZipFile = conf.getBoolean(IS_ZIPFILE, DEFAULT_ZIP); if (isZipFile) { @SuppressWarnings("resource") ZipInputStream zis = new ZipInputStream(new BufferedInputStream(is)); zis.getNextEntry(); is = zis; } this.is = is; this.reader = new CSVReader(new BufferedReader(new InputStreamReader(is))); } public int readLine(List<Text> values) throws IOException { return this.reader.readLine(values); } public void initialize(InputSplit genericSplit, JobConf conf) throws IOException { FileSplit split = (FileSplit) genericSplit; start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(conf); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(split.getPath()); if (codec != null) { is = codec.createInputStream(fileIn); end = Long.MAX_VALUE; } else { if (start != 0) { fileIn.seek(start); } is = fileIn; } this.pos = start; init(is, conf); } @Override public float getProgress() { if (start == end) { return 0.0f; } else { return Math.min(1.0f, (pos - start) / (float) (end - start)); } } public synchronized void close() throws IOException { if (reader != null) { reader.close(); reader = null; } if (is != null) { is.close(); is = null; } } @Override public LongWritable createKey() { return new LongWritable(); } @Override public List<Text> createValue() { return new ArrayListTextWritable(); } @Override public long getPos() throws IOException { return pos; } @Override public boolean next(LongWritable key, List<Text> value) throws IOException { if (key == null) { key = new LongWritable(); } key.set(pos); if (value == null) { value = new ArrayListTextWritable(); } while (true) { if (pos >= end) return false; int newSize = 0; newSize = reader.readLine(value); pos += newSize; if (newSize == 0) { if (isZipFile) { ZipInputStream zis = (ZipInputStream) is; if (zis.getNextEntry() != null) { is = zis; reader = new CSVReader(new BufferedReader(new InputStreamReader(is))); continue; } } key = null; value = null; return false; } else { removeNewLineOnLastColumn(value); return true; } } } private void removeNewLineOnLastColumn(List<Text> value) { String lastColumn = value.get(value.size() - 1).toString(); if (lastColumn.endsWith("\n")) value.set(value.size() - 1, new Text(lastColumn.substring(0, lastColumn.length() - 1))); } }