// Java tutorial
/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.tap.hadoop; import java.io.IOException; import cascading.tap.Tap; import cascading.tap.TapException; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.Reporter; import org.apache.log4j.Logger; /** * Class TapCollector is a kind of {@link cascading.tuple.TupleEntryCollector} that writes tuples to the resource managed by * a particular {@link cascading.tap.Tap} instance. 
*/ public class TapCollector extends TupleEntryCollector implements OutputCollector { /** Field LOG */ private static final Logger LOG = Logger.getLogger(TapCollector.class); /** Field conf */ private JobConf conf; /** Field writer */ private RecordWriter writer; /** Field filenamePattern */ private String filenamePattern = "%s%spart-%05d"; /** Field filename */ private String filename; /** Field tap */ private Tap tap; /** Field prefix */ private String prefix; /** Field outputEntry */ private TupleEntry outputEntry; /** Field isFileOutputFormat */ private boolean isFileOutputFormat; /** Field reporter */ private Reporter reporter = Reporter.NULL; /** * Constructor TapCollector creates a new TapCollector instance. * * @param tap of type Tap * @param conf of type JobConf * @throws IOException when fails to initialize */ public TapCollector(Tap tap, JobConf conf) throws IOException { this(tap, null, conf); } /** * Constructor TapCollector creates a new TapCollector instance. * * @param tap of type Tap * @param prefix of type String * @param conf of type JobConf * @throws IOException when fails to initialize */ public TapCollector(Tap tap, String prefix, JobConf conf) throws IOException { this.tap = tap; this.prefix = prefix == null || prefix.length() == 0 ? 
null : prefix; this.conf = new JobConf(conf); this.outputEntry = new TupleEntry(tap.getSinkFields()); this.filenamePattern = conf.get("cascading.tapcollector.partname", this.filenamePattern); initalize(); } private void initalize() throws IOException { tap.sinkInit(conf); // tap should not delete if called within a task OutputFormat outputFormat = conf.getOutputFormat(); isFileOutputFormat = outputFormat instanceof FileOutputFormat; if (isFileOutputFormat) { Hadoop18TapUtil.setupJob(conf); if (prefix != null) filename = String.format(filenamePattern, prefix, "/", conf.getInt("mapred.task.partition", 0)); else filename = String.format(filenamePattern, "", "", conf.getInt("mapred.task.partition", 0)); Hadoop18TapUtil.setupTask(conf); } writer = outputFormat.getRecordWriter(null, conf, filename, Reporter.NULL); } public void setReporter(Reporter reporter) { this.reporter = reporter; } protected void collect(Tuple tuple) { try { outputEntry.setTuple(tuple); tap.sink(outputEntry, this); } catch (IOException exception) { throw new TapException("unable to write to: " + filename, exception); } } @Override public void close() { try { if (isFileOutputFormat) LOG.info("closing tap collector for: " + new Path(tap.getPath(), filename)); else LOG.info("closing tap collector for: " + tap.toString()); try { writer.close(reporter); } finally { if (isFileOutputFormat) { if (Hadoop18TapUtil.needsTaskCommit(conf)) Hadoop18TapUtil.commitTask(conf); Hadoop18TapUtil.cleanupJob(conf); } } } catch (IOException exception) { LOG.warn("exception closing: " + filename, exception); throw new TapException("exception closing: " + filename, exception); } } /** * Method collect writes the given values to the {@link Tap} this instance encapsulates. 
* * @param writableComparable of type WritableComparable * @param writable of type Writable * @throws IOException when */ public void collect(Object writableComparable, Object writable) throws IOException { reporter.progress(); writer.write(writableComparable, writable); } }