/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.tap.MultiSourceTap;
import cascading.tap.TapException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;

/**
 * Class GlobHfs is a type of {@link cascading.tap.MultiSourceTap} that accepts Hadoop style 'file globing' expressions,
 * so multiple files that match the given pattern may be used as the input sources for a given {@link cascading.flow.Flow}.
 * <p/>
 * See {@link FileSystem#globStatus(org.apache.hadoop.fs.Path)} for details on the globing syntax. In short, it is
 * similar to standard regular expressions except alternation is done via {foo,bar} instead of (foo|bar).
 * <p/>
 * Note that a {@link cascading.flow.Flow} sourcing from GlobHfs is not currently compatible with the
 * {@link cascading.cascade.Cascade} scheduler. GlobHfs expects the files and paths to exist so the wildcards can be
 * resolved into concrete values, allowing the scheduler to order the Flows properly.
 * <p/>
 * Note that globing can match files or directories. It may consume fewer resources to match directories and let
 * Hadoop include all sub-files immediately contained in the directory instead of enumerating every individual file.
 * Ending the glob path with a {@code /} should match only directories.
 *
 * @see Hfs
 * @see cascading.tap.MultiSourceTap
 * @see FileSystem
 */
public class GlobHfs extends MultiSourceTap<Hfs, Configuration, RecordReader> {
  /** Field pathPattern */
  private final String pathPattern;
  /** Field pathFilter */
  private final PathFilter pathFilter;

  /**
   * Constructor GlobHfs creates a new GlobHfs instance.
   *
   * @param scheme      of type Scheme
   * @param pathPattern of type String
   */
  @ConstructorProperties({"scheme", "pathPattern"})
  public GlobHfs(Scheme<Configuration, RecordReader, ?, ?, ?> scheme, String pathPattern) {
    this(scheme, pathPattern, null);
  }

  /**
   * Constructor GlobHfs creates a new GlobHfs instance.
   *
   * @param scheme      of type Scheme
   * @param pathPattern of type String
   * @param pathFilter  of type PathFilter
   */
  @ConstructorProperties({"scheme", "pathPattern", "pathFilter"})
  public GlobHfs(Scheme<Configuration, RecordReader, ?, ?, ?> scheme, String pathPattern, PathFilter pathFilter) {
    super(scheme);
    this.pathPattern = pathPattern;
    this.pathFilter = pathFilter;
  }

  @Override
  public String getIdentifier() {
    return pathPattern;
  }

  @Override
  protected Hfs[] getTaps() {
    return initTapsInternal(new JobConf());
  }

  private Hfs[] initTapsInternal(Configuration conf) {
    if (taps != null)
      return taps;

    try {
      taps = makeTaps(conf);
    } catch (IOException exception) {
      // chain the cause so the underlying filesystem error is not lost
      throw new TapException("unable to resolve taps for globing path: " + pathPattern, exception);
    }

    return taps;
  }

  private Hfs[] makeTaps(Configuration conf) throws IOException {
    FileStatus[] statusList;

    Path path = new Path(pathPattern);

    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
      statusList = fileSystem.globStatus(path);
    else
      statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
      throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
      // remove empty files. some hadoop versions return non-zero for dirs,
      // so this jibes with the expectations set in the above javadoc
      if (statusList[i].isDir() || statusList[i].getLen() != 0)
        notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
      throw new TapException("all paths matching path pattern are zero length and not directories: " + pathPattern);

    return notEmpty.toArray(new Hfs[notEmpty.size()]);
  }

  @Override
  public void sourceConfInit(FlowProcess<? extends Configuration> process, Configuration conf) {
    Hfs[] taps = initTapsInternal(conf);

    for (Hfs tap : taps)
      taps[0].sourceConfInitAddInputPath(conf, tap.getPath()); // we are building fully qualified paths above

    taps[0].sourceConfInitComplete(process, conf);
  }

  @Override
  public boolean equals(Object object) {
    if (this == object)
      return true;
    if (object == null || getClass() != object.getClass())
      return false;

    GlobHfs globHfs = (GlobHfs) object;

    // do not compare tap arrays, these values should be sufficient to show identity
    if (getScheme() != null ? !getScheme().equals(globHfs.getScheme()) : globHfs.getScheme() != null)
      return false;
    if (pathFilter != null ? !pathFilter.equals(globHfs.pathFilter) : globHfs.pathFilter != null)
      return false;
    if (pathPattern != null ? !pathPattern.equals(globHfs.pathPattern) : globHfs.pathPattern != null)
      return false;

    return true;
  }

  @Override
  public int hashCode() {
    int result = pathPattern != null ? pathPattern.hashCode() : 0;
    result = 31 * result + (pathFilter != null ? pathFilter.hashCode() : 0);
    return result;
  }

  @Override
  public String toString() {
    return "GlobHfs[" + pathPattern + ']';
  }
}
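Below is a minimal usage sketch showing how a GlobHfs tap typically feeds a Flow: a glob pattern selects many input files, and a pass-through Pipe copies them into a single sink. The paths, the glob pattern, and the TextLine/HadoopFlowConnector wiring are illustrative assumptions, not part of the class above; adapt them to your data and Cascading platform version.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.GlobHfs;
import cascading.tap.hadoop.Hfs;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobHfsExample {
  public static void main(String[] args) {
    // Assumed input layout: alternation uses {access,error}, not (access|error),
    // per FileSystem#globStatus syntax referenced in the javadoc above.
    Tap source = new GlobHfs(new TextLine(), "/logs/2015-01-*/{access,error}*.log");

    // Optionally narrow the matches further with a Hadoop PathFilter,
    // here skipping Hadoop's _SUCCESS/_logs style entries (illustrative only).
    Tap filtered = new GlobHfs(new TextLine(), "/logs/2015-01-*/*", new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return !path.getName().startsWith("_");
      }
    });

    Tap sink = new Hfs(new TextLine(), "/output/merged-logs", SinkMode.REPLACE);

    // A Pipe with no operations simply streams every matched file to the sink.
    Pipe pipe = new Pipe("copy");

    Flow flow = new HadoopFlowConnector(new Properties()).connect(source, sink, pipe);
    flow.complete();
  }
}

Because GlobHfs resolves the pattern when the taps are first initialized, every matched path must already exist when the Flow is planned; this is the same constraint the javadoc notes for the Cascade scheduler.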