List of usage examples for org.apache.hadoop.fs.FileSystem#globStatus
public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
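Before the project excerpts below, here is a minimal standalone sketch of calling globStatus directly. The glob pattern, filter, and class name are illustrative assumptions, not taken from any of the listed projects; note that globStatus returns null when a non-glob path does not exist and an empty array when a pattern matches nothing, which is why several examples below check for both.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical glob pattern; adjust to your own layout.
        Path pattern = new Path("/data/logs/2024-*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);
        // Skip hidden files (names starting with "." or "_"), mirroring the
        // hiddenFileFilter convention used in the examples below.
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };
        // null: the (non-glob) path does not exist; empty: the pattern matched nothing.
        FileStatus[] matches = fs.globStatus(pattern, visibleOnly);
        if (matches == null || matches.length == 0) {
            System.out.println("No files matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}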
From source file: org.apache.hama.bsp.FileInputFormat.java
License: Apache License

/**
 * List input directories. Subclasses may override to, e.g., select only files
 * matching a regular expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(BSPJob job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = null;
        try {
            matches = fs.globStatus(p, inputFilter);
        } catch (Exception e) {
            LOG.info(p + "\n" + e.toString());
        }
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}
From source file: org.apache.hcatalog.mapreduce.FileOutputFormatContainer.java
License: Apache License

/**
 * Handles duplicate publish of partition. Fails if partition already exists.
 * For non partitioned tables, fails if files are present in table directory.
 * For dynamic partitioned publish, does nothing - check would need to be done at recordwriter time
 * @param context the job
 * @param outputInfo the output info
 * @param client the metastore client
 * @param table the table being written to
 * @throws IOException
 * @throws org.apache.hadoop.hive.metastore.api.MetaException
 * @throws org.apache.thrift.TException
 */
private static void handleDuplicatePublish(JobContext context, OutputJobInfo outputInfo,
        HiveMetaStoreClient client, Table table)
        throws IOException, MetaException, TException, NoSuchObjectException {
    /*
     * For fully specified ptn, follow strict checks for existence of partitions in metadata
     * For unpartitioned tables, follow filechecks
     * For partially specified tables:
     * This would then need filechecks at the start of a ptn write,
     * Doing metadata checks can get potentially very expensive (fat conf) if
     * there are a large number of partitions that match the partial specifications
     */
    if (table.getPartitionKeys().size() > 0) {
        if (!outputInfo.isDynamicPartitioningUsed()) {
            List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
            // fully-specified partition
            List<String> currentParts = client.listPartitionNames(outputInfo.getDatabaseName(),
                    outputInfo.getTableName(), partitionValues, (short) 1);
            if (currentParts.size() > 0) {
                throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION);
            }
        }
    } else {
        List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
        // non-partitioned table
        Path tablePath = new Path(table.getTTable().getSd().getLocation());
        FileSystem fs = tablePath.getFileSystem(context.getConfiguration());
        if (fs.exists(tablePath)) {
            FileStatus[] status = fs.globStatus(new Path(tablePath, "*"), hiddenFileFilter);
            if (status.length > 0) {
                throw new HCatException(ErrorType.ERROR_NON_EMPTY_TABLE,
                        table.getDbName() + "." + table.getTableName());
            }
        }
    }
}
From source file: org.apache.hive.hcatalog.mapreduce.FileOutputCommitterContainer.java
License: Apache License

/**
 * Run to discover dynamic partitions available
 */
private void discoverPartitions(JobContext context) throws IOException {
    if (!partitionsDiscovered) {
        // LOG.info("discover ptns called");
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());

        harProcessor.setEnabled(jobInfo.getHarRequested());

        List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
        int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();

        Path loadPath = new Path(jobInfo.getLocation());
        FileSystem fs = loadPath.getFileSystem(context.getConfiguration());

        // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
        String dynPathSpec = loadPath.toUri().getPath();
        dynPathSpec = dynPathSpec.replaceAll("__HIVE_DEFAULT_PARTITION__", "*");

        // LOG.info("Searching for " + dynPathSpec);

        Path pathPattern = new Path(dynPathSpec);
        FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);

        partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
        contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();

        if (status.length == 0) {
            // LOG.warn("No partition found genereated by dynamic partitioning in ["
            //     + loadPath + "] with depth[" + jobInfo.getTable().getPartitionKeysSize()
            //     + "], dynSpec[" + dynPathSpec + "]");
        } else {
            if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
                this.partitionsDiscovered = true;
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            for (FileStatus st : status) {
                LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
                if (!customDynamicLocationUsed) {
                    Warehouse.makeSpecFromName(fullPartSpec, st.getPath());
                } else {
                    HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo,
                            st.getPath().toString());
                }
                partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
                JobConf jobConf = (JobConf) context.getConfiguration();
                JobContext currContext = HCatMapRedUtil.createJobContext(jobConf, context.getJobID(),
                        InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf,
                                ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
                HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
                contextDiscoveredByPath.put(st.getPath().toString(), currContext);
            }
        }

        // for (Entry<String, Map<String, String>> spec : partitionsDiscoveredByPath.entrySet()) {
        //     LOG.info("Partition " + spec.getKey());
        //     for (Entry<String, String> e : spec.getValue().entrySet()) {
        //         LOG.info(e.getKey() + "=>" + e.getValue());
        //     }
        // }

        this.partitionsDiscovered = true;
    }
}
From source file: org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License: Apache License

public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {
    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    log.info("Create Hmm Model. File System = {}", fs);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            log.info("CreateHmmModel Matching Seq File Key = {}", key);
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc;
                // the single-digit state ID follows the underscore at char index 8
                int stateID = Character.getNumericValue(key.charAt(8));
                log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Transition Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc;
                // the single-digit state ID follows the underscore at char index 5
                int stateID = Character.getNumericValue(key.charAt(5));
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Emission Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }

    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    HmmUtils.validate(model);
    return model;
}
From source file: org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License: Apache License

/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {
    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");

    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == (int) 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }

    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    if (model != null) {
        return model;
    } else {
        throw new IOException("Error building model from output location");
    }
}
From source file: org.apache.mahout.clustering.kmeans.EigenSeedGenerator.java
License: Apache License

public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
        throws IOException {
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);

    Path outFile = new Path(output, "part-eigenSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);
        // store max value of each column
        Map<Integer, Double> maxEigens = Maps.newHashMapWithExpectedSize(k);
        Map<Integer, Text> chosenTexts = Maps.newHashMapWithExpectedSize(k);
        Map<Integer, ClusterWritable> chosenClusters = Maps.newHashMapWithExpectedSize(k);

        for (FileStatus fileStatus : inputFiles) {
            if (!fileStatus.isDir()) {
                for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                        fileStatus.getPath(), true, conf)) {
                    Writable key = record.getFirst();
                    VectorWritable value = record.getSecond();
                    for (Vector.Element e : value.get().nonZeroes()) {
                        int index = e.index();
                        double v = Math.abs(e.get());
                        if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
                            maxEigens.put(index, v);
                            Text newText = new Text(key.toString());
                            chosenTexts.put(index, newText);
                            Kluster newCluster = new Kluster(value.get(), index, measure);
                            newCluster.observe(value.get(), 1);
                            ClusterWritable clusterWritable = new ClusterWritable();
                            clusterWritable.setValue(newCluster);
                            chosenClusters.put(index, clusterWritable);
                        }
                    }
                }
            }
        }

        try {
            for (Integer key : maxEigens.keySet()) {
                writer.append(chosenTexts.get(key), chosenClusters.get(key));
            }
            log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}
From source file: org.apache.mahout.clustering.kmeans.RandomSeedGenerator.java
License: Apache License

public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure,
        Long seed) throws IOException {
    Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);

    Path outFile = new Path(output, "part-randomSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);

        Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();

        List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
        List<ClusterWritable> chosenClusters = Lists.newArrayListWithCapacity(k);
        int nextClusterId = 0;

        int index = 0;
        for (FileStatus fileStatus : inputFiles) {
            if (fileStatus.isDir()) {
                continue;
            }
            for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                    fileStatus.getPath(), true, conf)) {
                Writable key = record.getFirst();
                VectorWritable value = record.getSecond();
                Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
                newCluster.observe(value.get(), 1);
                Text newText = new Text(key.toString());
                int currentSize = chosenTexts.size();
                if (currentSize < k) {
                    chosenTexts.add(newText);
                    ClusterWritable clusterWritable = new ClusterWritable();
                    clusterWritable.setValue(newCluster);
                    chosenClusters.add(clusterWritable);
                } else {
                    int j = random.nextInt(index);
                    if (j < k) {
                        chosenTexts.set(j, newText);
                        ClusterWritable clusterWritable = new ClusterWritable();
                        clusterWritable.setValue(newCluster);
                        chosenClusters.set(j, clusterWritable);
                    }
                }
                index++;
            }
        }

        try {
            for (int i = 0; i < chosenTexts.size(); i++) {
                writer.append(chosenTexts.get(i), chosenClusters.get(i));
            }
            log.info("Wrote {} Klusters to {}", k, outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}
From source file: org.apache.oozie.action.hadoop.FsELFunctions.java
License: Apache License

/**
 * Return if a path exists.
 *
 * @param pathUri file system path uri.
 * @return <code>true</code> if the path exists, <code>false</code> if it does not.
 * @throws Exception
 */
public static boolean fs_exists(String pathUri) throws Exception {
    Path path = new Path(pathUri);
    FileSystem fs = getFileSystem(path.toUri());
    FileStatus[] pathArr;
    try {
        pathArr = fs.globStatus(path, new FSPathFilter());
    } catch (ReachingGlobMaxException e) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS013",
                "too many globbed files/dirs to do FS operation");
    }
    return (pathArr != null && pathArr.length > 0);
}
From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSequenceFileInputFormat.java
License: Apache License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> files = new ArrayList<FileStatus>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, hiddenFileFilter);
        if (matches == null) {
            throw new IOException("Input path does not exist: " + p);
        } else if (matches.length == 0) {
            throw new IOException("Input Pattern " + p + " matches 0 files");
        } else {
            for (FileStatus globStat : matches) {
                files.add(globStat);
            }
        }
    }
    return MapRedUtil.getAllFileRecursively(files, job.getConfiguration());
}
From source file: org.apache.pig.builtin.TrevniStorage.java
License: Apache License

@Override
public Schema getAvroSchema(Path p[], final Job job) throws IOException {
    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp, Utils.VISIBLE_FILES)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    AvroColumnReader.Params params = new AvroColumnReader.Params(
            new HadoopInput(filePath, job.getConfiguration()));
    AvroColumnReader<GenericData.Record> reader = new AvroColumnReader<GenericData.Record>(params);
    Schema s = reader.getFileSchema();
    reader.close();
    return s;
}