Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException 

Document

Return an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.
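
Before the project-specific usages below, here is a minimal, self-contained sketch of calling this overload. The /data/input/*/part-* pattern, the GlobStatusExample class name, and the hidden-file filter are illustrative assumptions, not taken from any example on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobStatusExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative pattern; adjust to your own directory layout.
        Path pattern = new Path("/data/input/*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // Filter that skips hidden files and directories ("_" and "." prefixes).
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // As the examples below show, a null result is treated as a missing
        // input path and an empty array as a pattern that matches no files.
        FileStatus[] matches = fs.globStatus(pattern, visibleOnly);
        if (matches == null) {
            System.err.println("Input path does not exist: " + pattern);
        } else {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}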

Usage

From source file:org.apache.hama.bsp.FileInputFormat.java

License:Apache License

/**
 * List input directories. Subclasses may override to, e.g., select only files
 * matching a regular expression.
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if no input paths are specified or any input path is invalid.
 */
protected FileStatus[] listStatus(BSPJob job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());

        FileStatus[] matches = null;
        try {
            matches = fs.globStatus(p, inputFilter);
        } catch (Exception e) {
            LOG.info(p + "\n" + e.toString());
        }

        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}
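
The hiddenFileFilter and MultiPathFilter helpers referenced in this snippet are not part of the excerpt. They are typically defined along the following lines (modeled on Hadoop's own FileInputFormat); the FilterHelpers class name is an assumption, and this is a sketch of the assumed definitions rather than the Hama source itself.

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class FilterHelpers {

    // Skips paths whose names start with "_" or ".", i.e. hidden/system files.
    static final PathFilter hiddenFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    // Accepts a path only if every wrapped filter accepts it.
    static class MultiPathFilter implements PathFilter {
        private final List<PathFilter> filters;

        MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        @Override
        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }
}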

From source file:org.apache.hcatalog.mapreduce.FileOutputFormatContainer.java

License:Apache License

/**
 * Handles duplicate publish of partition. Fails if partition already exists.
 * For non partitioned tables, fails if files are present in table directory.
 * For dynamic partitioned publish, does nothing; the check would need to be done at record-writer time
 * @param context the job
 * @param outputInfo the output info
 * @param client the metastore client
 * @param table the table being written to
 * @throws IOException
 * @throws org.apache.hadoop.hive.metastore.api.MetaException
 * @throws org.apache.thrift.TException
 */
private static void handleDuplicatePublish(JobContext context, OutputJobInfo outputInfo,
        HiveMetaStoreClient client, Table table)
        throws IOException, MetaException, TException, NoSuchObjectException {

    /*
    * For fully specified ptn, follow strict checks for existence of partitions in metadata
    * For unpartitioned tables, follow filechecks
    * For partially specified tables:
    *    This would then need filechecks at the start of a ptn write,
    *    Doing metadata checks can get potentially very expensive (fat conf) if
    *    there are a large number of partitions that match the partial specifications
    */

    if (table.getPartitionKeys().size() > 0) {
        if (!outputInfo.isDynamicPartitioningUsed()) {
            List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
            // fully-specified partition
            List<String> currentParts = client.listPartitionNames(outputInfo.getDatabaseName(),
                    outputInfo.getTableName(), partitionValues, (short) 1);

            if (currentParts.size() > 0) {
                throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION);
            }
        }
    } else {
        List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
        // non-partitioned table

        Path tablePath = new Path(table.getTTable().getSd().getLocation());
        FileSystem fs = tablePath.getFileSystem(context.getConfiguration());

        if (fs.exists(tablePath)) {
            FileStatus[] status = fs.globStatus(new Path(tablePath, "*"), hiddenFileFilter);

            if (status.length > 0) {
                throw new HCatException(ErrorType.ERROR_NON_EMPTY_TABLE,
                        table.getDbName() + "." + table.getTableName());
            }
        }
    }
}

From source file:org.apache.hive.hcatalog.mapreduce.FileOutputCommitterContainer.java

License:Apache License

/**
 * Run to discover dynamic partitions available
 */
private void discoverPartitions(JobContext context) throws IOException {
    if (!partitionsDiscovered) {
        //      LOG.info("discover ptns called");
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());

        harProcessor.setEnabled(jobInfo.getHarRequested());

        List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
        int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();

        Path loadPath = new Path(jobInfo.getLocation());
        FileSystem fs = loadPath.getFileSystem(context.getConfiguration());

        // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
        String dynPathSpec = loadPath.toUri().getPath();
        dynPathSpec = dynPathSpec.replaceAll("__HIVE_DEFAULT_PARTITION__", "*");

        //      LOG.info("Searching for "+dynPathSpec);
        Path pathPattern = new Path(dynPathSpec);
        FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);

        partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
        contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();

        if (status.length == 0) {
            //        LOG.warn("No partition found genereated by dynamic partitioning in ["
            //            +loadPath+"] with depth["+jobInfo.getTable().getPartitionKeysSize()
            //            +"], dynSpec["+dynPathSpec+"]");
        } else {
            if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
                this.partitionsDiscovered = true;
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            for (FileStatus st : status) {
                LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
                if (!customDynamicLocationUsed) {
                    Warehouse.makeSpecFromName(fullPartSpec, st.getPath());
                } else {
                    HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo,
                            st.getPath().toString());
                }
                partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
                JobConf jobConf = (JobConf) context.getConfiguration();
                JobContext currContext = HCatMapRedUtil.createJobContext(jobConf, context.getJobID(),
                        InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf,
                                ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
                HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
                contextDiscoveredByPath.put(st.getPath().toString(), currContext);
            }
        }

        //      for (Entry<String,Map<String,String>> spec : partitionsDiscoveredByPath.entrySet()){
        //        LOG.info("Partition "+ spec.getKey());
        //        for (Entry<String,String> e : spec.getValue().entrySet()){
        //          LOG.info(e.getKey() + "=>" +e.getValue());
        //        }
        //      }

        this.partitionsDiscovered = true;
    }
}

From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java

License:Apache License

public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    log.info("Create Hmm Model. File System = {}", fs);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            log.info("CreateHmmModel Matching Seq File Key = {}", key);
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Initial Prob Adding  Key, Value  = ({} {})",
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                // the number after _ is the state ID at char index 8
                int stateID = Character.getNumericValue(key.charAt(8));
                log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Transition Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                // the number after _ is the state ID at char number 5
                int stateID = Character.getNumericValue(key.charAt(5));
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Emission Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }
    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    HmmUtils.validate(model);
    return model;
}

From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java

License:Apache License

/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");

    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == (int) 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }

    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);

    if (model != null) {
        return model;
    } else
        throw new IOException("Error building model from output location");

}

From source file:org.apache.mahout.clustering.kmeans.EigenSeedGenerator.java

License:Apache License

public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
        throws IOException {
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-eigenSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);
        Map<Integer, Double> maxEigens = Maps.newHashMapWithExpectedSize(k); // store max value of each column
        Map<Integer, Text> chosenTexts = Maps.newHashMapWithExpectedSize(k);
        Map<Integer, ClusterWritable> chosenClusters = Maps.newHashMapWithExpectedSize(k);

        for (FileStatus fileStatus : inputFiles) {
            if (!fileStatus.isDir()) {
                for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                        fileStatus.getPath(), true, conf)) {
                    Writable key = record.getFirst();
                    VectorWritable value = record.getSecond();

                    for (Vector.Element e : value.get().nonZeroes()) {
                        int index = e.index();
                        double v = Math.abs(e.get());

                        if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
                            maxEigens.put(index, v);
                            Text newText = new Text(key.toString());
                            chosenTexts.put(index, newText);
                            Kluster newCluster = new Kluster(value.get(), index, measure);
                            newCluster.observe(value.get(), 1);
                            ClusterWritable clusterWritable = new ClusterWritable();
                            clusterWritable.setValue(newCluster);
                            chosenClusters.put(index, clusterWritable);
                        }
                    }
                }
            }
        }

        try {
            for (Integer key : maxEigens.keySet()) {
                writer.append(chosenTexts.get(key), chosenClusters.get(key));
            }
            log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}

From source file:org.apache.mahout.clustering.kmeans.RandomSeedGenerator.java

License:Apache License

public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure,
        Long seed) throws IOException {

    Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-randomSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);

        Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();

        List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
        List<ClusterWritable> chosenClusters = Lists.newArrayListWithCapacity(k);
        int nextClusterId = 0;

        int index = 0;
        for (FileStatus fileStatus : inputFiles) {
            if (fileStatus.isDir()) {
                continue;
            }
            for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                    fileStatus.getPath(), true, conf)) {
                Writable key = record.getFirst();
                VectorWritable value = record.getSecond();
                Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
                newCluster.observe(value.get(), 1);
                Text newText = new Text(key.toString());
                int currentSize = chosenTexts.size();
                if (currentSize < k) {
                    chosenTexts.add(newText);
                    ClusterWritable clusterWritable = new ClusterWritable();
                    clusterWritable.setValue(newCluster);
                    chosenClusters.add(clusterWritable);
                } else {
                    int j = random.nextInt(index);
                    if (j < k) {
                        chosenTexts.set(j, newText);
                        ClusterWritable clusterWritable = new ClusterWritable();
                        clusterWritable.setValue(newCluster);
                        chosenClusters.set(j, clusterWritable);
                    }
                }
                index++;
            }
        }

        try {
            for (int i = 0; i < chosenTexts.size(); i++) {
                writer.append(chosenTexts.get(i), chosenClusters.get(i));
            }
            log.info("Wrote {} Klusters to {}", k, outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}

From source file:org.apache.oozie.action.hadoop.FsELFunctions.java

License:Apache License

/**
 * Return whether a path exists.
 *
 * @param pathUri file system path uri.
 * @return <code>true</code> if the path exists, <code>false</code> if it does not.
 * @throws Exception
 */
public static boolean fs_exists(String pathUri) throws Exception {
    Path path = new Path(pathUri);
    FileSystem fs = getFileSystem(path.toUri());
    FileStatus[] pathArr;
    try {
        pathArr = fs.globStatus(path, new FSPathFilter());
    } catch (ReachingGlobMaxException e) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS013",
                "too many globbed files/dirs to do FS operation");
    }
    return (pathArr != null && pathArr.length > 0);
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSequenceFileInputFormat.java

License:Apache License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, hiddenFileFilter);
        if (matches == null) {
            throw new IOException("Input path does not exist: " + p);
        } else if (matches.length == 0) {
            throw new IOException("Input Pattern " + p + " matches 0 files");
        } else {
            for (FileStatus globStat : matches) {
                files.add(globStat);
            }
        }
    }
    return MapRedUtil.getAllFileRecursively(files, job.getConfiguration());
}

From source file:org.apache.pig.builtin.TrevniStorage.java

License:Apache License

@Override
public Schema getAvroSchema(Path p[], final Job job) throws IOException {

    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp, Utils.VISIBLE_FILES)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    AvroColumnReader.Params params = new AvroColumnReader.Params(
            new HadoopInput(filePath, job.getConfiguration()));
    AvroColumnReader<GenericData.Record> reader = new AvroColumnReader<GenericData.Record>(params);
    Schema s = reader.getFileSchema();
    reader.close();
    return s;
}