// Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.io.impl;

import java.io.IOException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.SourceTarget;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mr.plan.PlanningParameters;
import org.apache.crunch.io.CrunchOutputs;
import org.apache.crunch.io.FileNamingScheme;
import org.apache.crunch.io.FormatBundle;
import org.apache.crunch.io.OutputHandler;
import org.apache.crunch.io.PathTarget;
import org.apache.crunch.io.SourceTargetHelper;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * A {@link PathTarget} that places job output files under a single destination {@link Path},
 * using a {@link FileOutputFormat} class wrapped in a {@link FormatBundle} and a
 * {@link FileNamingScheme} to choose the final file names.
 *
 * <p>Two instances are considered equal when they are of the same class and have equal paths;
 * the output format and naming scheme do not take part in {@link #equals(Object)} or
 * {@link #hashCode()}.
 */
public class FileTargetImpl implements PathTarget {

  private static final Log LOG = LogFactory.getLog(FileTargetImpl.class);

  /** Matches a raw reducer output name and captures its five-digit partition number. */
  private static final Pattern REDUCE_OUTPUT_PATTERN = Pattern.compile(".*-r-(\\d{5})");

  protected final Path path;
  private final FormatBundle<? extends FileOutputFormat> formatBundle;
  private final FileNamingScheme fileNamingScheme;

  /**
   * Creates a target with no extra output configuration.
   *
   * @param path the destination directory
   * @param outputFormatClass the output format used to write the files
   * @param fileNamingScheme the scheme used to name the final output files
   */
  public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
      FileNamingScheme fileNamingScheme) {
    this(path, outputFormatClass, fileNamingScheme, ImmutableMap.<String, String>of());
  }

  /**
   * Creates a target and copies every entry of {@code extraConf} into the format bundle.
   *
   * @param path the destination directory
   * @param outputFormatClass the output format used to write the files
   * @param fileNamingScheme the scheme used to name the final output files
   * @param extraConf additional key/value settings for the format bundle; may be {@code null}
   *     or empty, in which case nothing is set
   */
  public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
      FileNamingScheme fileNamingScheme, Map<String, String> extraConf) {
    this.path = path;
    this.formatBundle = FormatBundle.forOutput(outputFormatClass);
    this.fileNamingScheme = fileNamingScheme;

    if (extraConf != null && !extraConf.isEmpty()) {
      for (Map.Entry<String, String> e : extraConf.entrySet()) {
        formatBundle.set(e.getKey(), e.getValue());
      }
    }
  }

  /**
   * Stores a configuration key/value pair on this target's format bundle.
   *
   * @return this target, to allow chaining
   */
  @Override
  public Target outputConf(String key, String value) {
    formatBundle.set(key, value);
    return this;
  }

  /**
   * Configures {@code job} to write to {@code outputPath} using the key and value classes
   * reported by the converter of {@code ptype} and this target's format bundle.
   */
  @Override
  public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
    Converter converter = ptype.getConverter();
    Class keyClass = converter.getKeyClass();
    Class valueClass = converter.getValueClass();
    configureForMapReduce(job, keyClass, valueClass, formatBundle, outputPath, name);
  }

  /**
   * Wraps {@code outputFormatClass} in a new {@link FormatBundle} and delegates to the
   * bundle-based overload.
   *
   * @deprecated use
   *     {@link #configureForMapReduce(Job, Class, Class, FormatBundle, Path, String)} instead
   */
  @Deprecated
  protected void configureForMapReduce(Job job, Class keyClass, Class valueClass,
      Class outputFormatClass, Path outputPath, String name) {
    configureForMapReduce(job, keyClass, valueClass, FormatBundle.forOutput(outputFormatClass),
        outputPath, name);
  }

  /**
   * Sets the job's output path and registers the output format.
   *
   * <p>When {@code name} is {@code null} the format, key class and value class are set directly
   * on the job; otherwise they are registered as a named output through {@link CrunchOutputs}.
   *
   * @throws RuntimeException wrapping any exception raised while setting the output path
   */
  protected void configureForMapReduce(Job job, Class keyClass, Class valueClass,
      FormatBundle formatBundle, Path outputPath, String name) {
    try {
      FileOutputFormat.setOutputPath(job, outputPath);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    if (name == null) {
      job.setOutputFormatClass(formatBundle.getFormatClass());
      formatBundle.configure(job.getConfiguration());
      job.setOutputKeyClass(keyClass);
      job.setOutputValueClass(valueClass);
    } else {
      CrunchOutputs.addNamedOutput(job, name, formatBundle, keyClass, valueClass);
    }
  }

  /**
   * Asks {@code handler} to configure this target for {@code ptype}.
   *
   * @return always {@code true}
   */
  @Override
  public boolean accept(OutputHandler handler, PType<?> ptype) {
    handler.configure(this, ptype);
    return true;
  }

  /** Returns the converter supplied by {@code ptype}. */
  @Override
  public Converter<?, ?, ?, ?> getConverter(PType<?> ptype) {
    return ptype.getConverter();
  }

  /**
   * Moves the output files for the given output index from {@code workingPath} into this
   * target's path and then writes the {@code _SUCCESS} marker file.
   *
   * <p>The destination directory is created if it does not exist. Files are renamed when the
   * source file system accepts the destination path (see
   * {@link #isCompatible(FileSystem, Path)}), and copied with source deletion otherwise.
   *
   * @param conf the configuration used to resolve file systems and output names
   * @param workingPath the directory holding the raw job output
   * @param index the output index used to build the source file pattern
   * @throws IOException if a file system operation fails or a rename reports failure
   */
  @Override
  public void handleOutputs(Configuration conf, Path workingPath, int index) throws IOException {
    FileSystem srcFs = workingPath.getFileSystem(conf);
    Path src = getSourcePattern(workingPath, index);
    Path[] srcs = FileUtil.stat2Paths(srcFs.globStatus(src), src);
    FileSystem dstFs = path.getFileSystem(conf);
    if (!dstFs.exists(path)) {
      dstFs.mkdirs(path);
    }
    boolean sameFs = isCompatible(srcFs, path);
    for (Path s : srcs) {
      // Files whose name contains "-m-" are named through the map-output branch of getDestFile.
      Path d = getDestFile(conf, s, path, s.getName().contains("-m-"));
      if (sameFs) {
        // rename reports failure through its return value; ignoring it would silently drop
        // this file while the _SUCCESS marker below is still written.
        if (!srcFs.rename(s, d)) {
          throw new IOException("Unable to rename " + s + " to " + d);
        }
      } else {
        FileUtil.copy(srcFs, s, dstFs, d, true, true, conf);
      }
    }
    dstFs.create(getSuccessIndicator(), true).close();
  }

  /** Returns the location of the {@code _SUCCESS} marker file inside the target path. */
  private Path getSuccessIndicator() {
    return new Path(path, "_SUCCESS");
  }

  /**
   * Builds the glob pattern selecting the working-directory files that belong to the output
   * with the given index.
   */
  protected Path getSourcePattern(Path workingPath, int index) {
    return new Path(workingPath, PlanningParameters.MULTI_OUTPUT_PREFIX + index + "-*");
  }

  /** Returns the destination path of this target. */
  @Override
  public Path getPath() {
    return path;
  }

  /**
   * Checks whether {@code fs} can qualify {@code path}.
   *
   * @return {@code true} if {@code fs.makeQualified(path)} completes, {@code false} if it throws
   *     an {@link IllegalArgumentException}
   */
  protected static boolean isCompatible(FileSystem fs, Path path) {
    try {
      fs.makeQualified(path);
      return true;
    } catch (IllegalArgumentException e) {
      return false;
    }
  }

  /**
   * Computes the final location of a raw output file.
   *
   * <p>The base name comes from the file naming scheme: the map output name for map-only jobs,
   * otherwise the reduce output name for the partition number parsed from the source file name.
   * Everything from the first {@code '.'} of the source file name onwards is appended to it.
   *
   * @param conf the configuration handed to the naming scheme
   * @param src the raw output file
   * @param dir the destination directory
   * @param mapOnlyJob whether the file should be named as map output
   * @return the destination file inside {@code dir}
   * @throws IOException declared for the naming-scheme calls
   * @throws IllegalArgumentException if a reducer output name cannot be parsed
   */
  protected Path getDestFile(Configuration conf, Path src, Path dir, boolean mapOnlyJob)
      throws IOException {
    String outputFilename = null;
    String sourceFilename = src.getName();
    if (mapOnlyJob) {
      outputFilename = getFileNamingScheme().getMapOutputName(conf, dir);
    } else {
      outputFilename = getFileNamingScheme().getReduceOutputName(conf, dir,
          extractPartitionNumber(sourceFilename));
    }
    if (sourceFilename.contains(".")) {
      outputFilename += sourceFilename.substring(sourceFilename.indexOf("."));
    }
    return new Path(dir, outputFilename);
  }

  /**
   * Extract the partition number from a raw reducer output filename.
   *
   * @param reduceOutputFileName The raw reducer output file name
   * @return The partition number encoded in the filename
   * @throws IllegalArgumentException if the name does not contain {@code -r-} followed by five
   *     digits
   */
  public static int extractPartitionNumber(String reduceOutputFileName) {
    Matcher matcher = REDUCE_OUTPUT_PATTERN.matcher(reduceOutputFileName);
    if (matcher.find()) {
      return Integer.parseInt(matcher.group(1), 10);
    } else {
      throw new IllegalArgumentException(
          "Reducer output name '" + reduceOutputFileName + "' cannot be parsed");
    }
  }

  /** Returns the naming scheme used for the final output files. */
  @Override
  public FileNamingScheme getFileNamingScheme() {
    return fileNamingScheme;
  }

  /** Compares by exact class and path only. */
  @Override
  public boolean equals(Object other) {
    if (other == null || !getClass().equals(other.getClass())) {
      return false;
    }
    FileTargetImpl o = (FileTargetImpl) other;
    return path.equals(o.path);
  }

  /** Hashes the path only, consistent with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return new HashCodeBuilder().append(path).toHashCode();
  }

  /** Renders as the output format's simple class name followed by the path in parentheses. */
  @Override
  public String toString() {
    return new StringBuilder()
        .append(formatBundle.getFormatClass().getSimpleName())
        .append("(")
        .append(path)
        .append(")")
        .toString();
  }

  /**
   * Returns {@code null}: this implementation does not provide a source view of the target.
   */
  @Override
  public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
    // By default, assume that we cannot do this.
    return null;
  }

  /**
   * Applies the given write mode to any data already present at the target path.
   *
   * <ul>
   *   <li>{@code DEFAULT}: fails if the path exists.</li>
   *   <li>{@code OVERWRITE}: deletes the existing path recursively.</li>
   *   <li>{@code APPEND}: leaves the existing data in place.</li>
   *   <li>{@code CHECKPOINT}: keeps the data when the {@code _SUCCESS} marker exists and the
   *       target was modified after {@code lastModForSource}; otherwise deletes it.</li>
   * </ul>
   *
   * <p>An {@link IOException} raised while deleting is logged and not rethrown.
   *
   * @param strategy the write mode to apply
   * @param lastModForSource last-modified time of the source data, compared against the target
   * @param conf the configuration used to resolve the file system
   * @return {@code false} if the path did not exist or a checkpoint was discarded, otherwise
   *     {@code true}
   * @throws CrunchRuntimeException if the file system cannot be obtained, the existence check
   *     fails, the path exists under {@code DEFAULT}, or the write mode is unknown
   */
  @Override
  public boolean handleExisting(WriteMode strategy, long lastModForSource, Configuration conf) {
    FileSystem fs = null;
    try {
      fs = path.getFileSystem(conf);
    } catch (IOException e) {
      LOG.error("Could not retrieve FileSystem object to check for existing path", e);
      throw new CrunchRuntimeException(e);
    }

    boolean exists = false;
    boolean successful = false;
    long lastModForTarget = -1;
    try {
      exists = fs.exists(path);
      if (exists) {
        successful = fs.exists(getSuccessIndicator());
        lastModForTarget = SourceTargetHelper.getLastModifiedAt(fs, path);
      }
    } catch (IOException e) {
      LOG.error("Exception checking existence of path: " + path, e);
      throw new CrunchRuntimeException(e);
    }

    if (exists) {
      switch (strategy) {
        case DEFAULT:
          LOG.error("Path " + path + " already exists!");
          throw new CrunchRuntimeException("Path already exists: " + path);
        case OVERWRITE:
          LOG.info("Removing data at existing path: " + path);
          try {
            fs.delete(path, true);
          } catch (IOException e) {
            LOG.error("Exception thrown removing data at path: " + path, e);
          }
          break;
        case APPEND:
          LOG.info("Adding output files to existing path: " + path);
          break;
        case CHECKPOINT:
          if (successful && lastModForTarget > lastModForSource) {
            LOG.info("Re-starting pipeline from checkpoint path: " + path);
            break;
          } else {
            if (!successful) {
              LOG.info("_SUCCESS file not found, Removing data at existing checkpoint path: " + path);
            } else {
              LOG.info("Source data has recent updates. Removing data at existing checkpoint path: " + path);
            }
            try {
              fs.delete(path, true);
            } catch (IOException e) {
              LOG.error("Exception thrown removing data at checkpoint path: " + path, e);
            }
            // The checkpoint was discarded, so report the target as not existing.
            return false;
          }
        default:
          throw new CrunchRuntimeException("Unknown WriteMode: " + strategy);
      }
    } else {
      LOG.info("Will write output files to new path: " + path);
    }
    return exists;
  }
}