List of usage examples for org.apache.hadoop.mapreduce JobContext getCredentials
public Credentials getCredentials();
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/** List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. /*w w w . jav a2 s.c o m*/ * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDirectory()) { for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) { result.add(stat); } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/** List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. // www .j av a 2 s.com * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // Whether we need to recursive look into the directory structure boolean recursive = getInputDirRecursive(job); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); List<FileStatus> result = null; int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS); Stopwatch sw = Stopwatch.createStarted(); if (numThreads == 1) { result = singleThreadedListStatus(job, dirs, inputFilter, recursive); } else { Iterable<FileStatus> locatedFiles = null; try { LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher( job.getConfiguration(), dirs, recursive, inputFilter, true); locatedFiles = locatedFileStatusFetcher.getFileStatuses(); } catch (InterruptedException e) { throw new IOException("Interrupted while getting file statuses"); } result = Lists.newArrayList(locatedFiles); } sw.stop(); if (LOG.isDebugEnabled()) { LOG.debug("Time taken to get FileStatuses: " + sw.elapsed(TimeUnit.MILLISECONDS)); } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
protected List<FileStatus> listStatus(JobContext job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }//from ww w. ja v a 2 s .c om // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // Whether we need to recursive look into the directory structure boolean recursive = getInputDirRecursive(job); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); List<FileStatus> result = simpleListStatus(job, dirs, inputFilter, recursive); LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/** * List input directories./*from ww w .ja va 2 s . c o m*/ * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // Get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); List<IOException> errors = new ArrayList<IOException>(); for (Path p : dirs) { FileSystem fs = p.getFileSystem(job.getConfiguration()); final SmilePathFilter filter = new SmilePathFilter(); FileStatus[] matches = fs.globStatus(p, filter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { Collections.addAll(result, fs.listStatus(globStat.getPath(), filter)); } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } return result; }
From source file:com.sourcecode.FileInputFormat.java
License:Apache License
/** List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. /*from w ww .ja v a2 s . co m*/ * * @param job the job to list input paths for * @return array of FileStatus objects * @throws IOException if zero items. */ protected List<FileStatus> listStatus(JobContext job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // Whether we need to recursive look into the directory structure boolean recursive = getInputDirRecursive(job); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); List<FileStatus> result = null; int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS); Stopwatch sw = new Stopwatch().start(); if (numThreads == 1) { result = singleThreadedListStatus(job, dirs, inputFilter, recursive); } else { Iterable<FileStatus> locatedFiles = null; try { LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher( job.getConfiguration(), dirs, recursive, inputFilter, true); locatedFiles = locatedFileStatusFetcher.getFileStatuses(); } catch (InterruptedException e) { throw new IOException("Interrupted while getting file statuses"); } result = Lists.newArrayList(locatedFiles); } sw.stop(); if (LOG.isDebugEnabled()) { LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis()); } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:info.halo9pan.word2vec.hadoop.mr.SortOutputFormat.java
License:Apache License
@Override public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException { // Ensure that the output directory is set Path outDir = getOutputPath(job); if (outDir == null) { throw new InvalidJobConfException("Output directory not set in JobConf."); }/* w w w. j a v a 2s . co m*/ final Configuration jobConf = job.getConfiguration(); // get delegation token for outDir's file system TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf); final FileSystem fs = outDir.getFileSystem(jobConf); if (fs.exists(outDir)) { // existing output dir is considered empty iff its only content is the // partition file. // final FileStatus[] outDirKids = fs.listStatus(outDir); boolean empty = false; if (outDirKids != null && outDirKids.length == 1) { final FileStatus st = outDirKids[0]; final String fname = st.getPath().getName(); empty = !st.isDirectory() && SortInputFormat.PARTITION_FILENAME.equals(fname); } if (WordSort.getUseSimplePartitioner(job) || !empty) { throw new FileAlreadyExistsException("Output directory " + outDir + " already exists"); } } }
From source file:info.halo9pan.word2vec.hadoop.terasort.TeraOutputFormat.java
License:Apache License
@Override public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException { // Ensure that the output directory is set Path outDir = getOutputPath(job); if (outDir == null) { throw new InvalidJobConfException("Output directory not set in JobConf."); }//from w ww .j a va 2 s . c o m final Configuration jobConf = job.getConfiguration(); // get delegation token for outDir's file system TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf); final FileSystem fs = outDir.getFileSystem(jobConf); if (fs.exists(outDir)) { // existing output dir is considered empty iff its only content is the // partition file. // final FileStatus[] outDirKids = fs.listStatus(outDir); boolean empty = false; if (outDirKids != null && outDirKids.length == 1) { final FileStatus st = outDirKids[0]; final String fname = st.getPath().getName(); empty = !st.isDirectory() && TeraInputFormat.PARTITION_FILENAME.equals(fname); } if (TeraSort.getUseSimplePartitioner(job) || !empty) { throw new FileAlreadyExistsException("Output directory " + outDir + " already exists"); } } }
From source file:io.amient.kafka.hadoop.io.MultiOutputFormat.java
License:Apache License
public void checkOutputSpecs(JobContext job) throws IOException { // Ensure that the output directory is set and not already there Path outDir = getOutputPath(job); if (outDir == null) { throw new InvalidJobConfException("Output directory not set."); }//from w ww .j a va 2 s .c o m // get delegation token for outDir's file system TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration()); }
From source file:kogiri.mapreduce.common.kmermatch.KmerMatchInputFormat.java
License:Open Source License
@Override protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }/*from ww w . ja v a2s .co m*/ // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } filters.add(new KmerIndexIndexPathFilter()); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; if (inputFilter.accept(p)) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus status = fs.getFileStatus(p); result.add(status); } } LOG.info("Total input paths to process : " + result.size()); return result; }
From source file:kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexInputFormat.java
License:Open Source License
@Override protected List<FileStatus> listStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }/*from w w w .j av a2 s . co m*/ // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } filters.add(new KmerIndexPartPathFilter()); PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; if (inputFilter.accept(p)) { FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus status = fs.getFileStatus(p); result.add(status); } } LOG.info("Total input paths to process : " + result.size()); return result; }