Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException 

Document

Return an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.
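
Before the project-specific usages below, here is a minimal, self-contained sketch of calling this overload. The /data/input/*/part-* pattern, the GlobStatusExample class name, and the hidden-file filter are illustrative assumptions, not taken from any example on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobStatusExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative pattern; adjust to your own directory layout.
        Path pattern = new Path("/data/input/*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // Filter that skips hidden files and directories ("_" and "." prefixes).
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // As the examples below show, a null result is treated as a missing
        // input path and an empty array as a pattern that matches no files.
        FileStatus[] matches = fs.globStatus(pattern, visibleOnly);
        if (matches == null) {
            System.err.println("Input path does not exist: " + pattern);
        } else {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}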

Usage

From source file:org.apache.hama.bsp.FileInputFormat.java

License:Apache License

/**
 * List input directories. Subclasses may override to, e.g., select only files
 * matching a regular expression.
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if no input paths are specified or any input path is invalid.
 */
protected FileStatus[] listStatus(BSPJob job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());

        FileStatus[] matches = null;
        try {
            matches = fs.globStatus(p, inputFilter);
        } catch (Exception e) {
            LOG.info(p + "\n" + e.toString());
        }

        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}
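
The hiddenFileFilter and MultiPathFilter helpers referenced in this snippet are not part of the excerpt. They are typically defined along the following lines (modeled on Hadoop's own FileInputFormat); the FilterHelpers class name is an assumption, and this is a sketch of the assumed definitions rather than the Hama source itself.

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class FilterHelpers {

    // Skips paths whose names start with "_" or ".", i.e. hidden/system files.
    static final PathFilter hiddenFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    // Accepts a path only if every wrapped filter accepts it.
    static class MultiPathFilter implements PathFilter {
        private final List<PathFilter> filters;

        MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        @Override
        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }
}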

From source file:org.apache.hcatalog.mapreduce.FileOutputFormatContainer.java

License:Apache License

/**
 * Handles duplicate publish of partition. Fails if partition already exists.
 * For non partitioned tables, fails if files are present in table directory.
 * For dynamic partitioned publish, does nothing; the check would need to be done at record-writer time
 * @param context the job
 * @param outputInfo the output info
 * @param client the metastore client
 * @param table the table being written to
 * @throws IOException
 * @throws org.apache.hadoop.hive.metastore.api.MetaException
 * @throws org.apache.thrift.TException
 */
private static void handleDuplicatePublish(JobContext context, OutputJobInfo outputInfo,
        HiveMetaStoreClient client, Table table)
        throws IOException, MetaException, TException, NoSuchObjectException {

    /*
    * For fully specified ptn, follow strict checks for existence of partitions in metadata
    * For unpartitioned tables, follow filechecks
    * For partially specified tables:
    *    This would then need filechecks at the start of a ptn write,
    *    Doing metadata checks can get potentially very expensive (fat conf) if
    *    there are a large number of partitions that match the partial specifications
    */

    if (table.getPartitionKeys().size() > 0) {
        if (!outputInfo.isDynamicPartitioningUsed()) {
            List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
            // fully-specified partition
            List<String> currentParts = client.listPartitionNames(outputInfo.getDatabaseName(),
                    outputInfo.getTableName(), partitionValues, (short) 1);

            if (currentParts.size() > 0) {
                throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION);
            }
        }
    } else {
        List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
        // non-partitioned table

        Path tablePath = new Path(table.getTTable().getSd().getLocation());
        FileSystem fs = tablePath.getFileSystem(context.getConfiguration());

        if (fs.exists(tablePath)) {
            FileStatus[] status = fs.globStatus(new Path(tablePath, "*"), hiddenFileFilter);

            if (status.length > 0) {
                throw new HCatException(ErrorType.ERROR_NON_EMPTY_TABLE,
                        table.getDbName() + "." + table.getTableName());
            }
        }
    }
}

From source file:org.apache.hive.hcatalog.mapreduce.FileOutputCommitterContainer.java

License:Apache License

/**
 * Run to discover dynamic partitions available
 */
private void discoverPartitions(JobContext context) throws IOException {
    if (!partitionsDiscovered) {
        //      LOG.info("discover ptns called");
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());

        harProcessor.setEnabled(jobInfo.getHarRequested());

        List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
        int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();

        Path loadPath = new Path(jobInfo.getLocation());
        FileSystem fs = loadPath.getFileSystem(context.getConfiguration());

        // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
        String dynPathSpec = loadPath.toUri().getPath();
        dynPathSpec = dynPathSpec.replaceAll("__HIVE_DEFAULT_PARTITION__", "*");

        //      LOG.info("Searching for "+dynPathSpec);
        Path pathPattern = new Path(dynPathSpec);
        FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);

        partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
        contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();

        if (status.length == 0) {
            //        LOG.warn("No partition found genereated by dynamic partitioning in ["
            //            +loadPath+"] with depth["+jobInfo.getTable().getPartitionKeysSize()
            //            +"], dynSpec["+dynPathSpec+"]");
        } else {
            if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
                this.partitionsDiscovered = true;
                throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
                        "Number of dynamic partitions being created "
                                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                                + "] if needed.");
            }

            for (FileStatus st : status) {
                LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
                if (!customDynamicLocationUsed) {
                    Warehouse.makeSpecFromName(fullPartSpec, st.getPath());
                } else {
                    HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo,
                            st.getPath().toString());
                }
                partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
                JobConf jobConf = (JobConf) context.getConfiguration();
                JobContext currContext = HCatMapRedUtil.createJobContext(jobConf, context.getJobID(),
                        InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf,
                                ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
                HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
                contextDiscoveredByPath.put(st.getPath().toString(), currContext);
            }
        }

        //      for (Entry<String,Map<String,String>> spec : partitionsDiscoveredByPath.entrySet()){
        //        LOG.info("Partition "+ spec.getKey());
        //        for (Entry<String,String> e : spec.getValue().entrySet()){
        //          LOG.info(e.getKey() + "=>" +e.getValue());
        //        }
        //      }

        this.partitionsDiscovered = true;
    }
}

From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java

License:Apache License

public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    log.info("Create Hmm Model. File System = {}", fs);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            log.info("CreateHmmModel Matching Seq File Key = {}", key);
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Initial Prob Adding  Key, Value  = ({} {})",
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                // the number after _ is the state ID at char index 8
                int stateID = Character.getNumericValue(key.charAt(8));
                log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Transition Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                // the number after _ is the state ID at char number 5
                int stateID = Character.getNumericValue(key.charAt(5));
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Emission Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }
    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    HmmUtils.validate(model);
    return model;
}

From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java

License:Apache License

/**
 * Converts the sequence files present in a directory to a {@link HmmModel} model.
 *
 * @param nrOfHiddenStates Number of hidden states
 * @param nrOfOutputStates Number of output states
 * @param modelPath        Location of the sequence files containing the model's distributions
 * @param conf             Configuration object
 * @return HmmModel the encoded model
 * @throws IOException
 */
public static HmmModel createHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");

    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == (int) 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == (int) 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                int stateID = Integer.parseInt(key.toString().split("_")[1]);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }

    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);

    if (model != null) {
        return model;
    } else
        throw new IOException("Error building model from output location");

}

From source file:org.apache.mahout.clustering.kmeans.EigenSeedGenerator.java

License:Apache License

public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
        throws IOException {
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-eigenSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);
        Map<Integer, Double> maxEigens = Maps.newHashMapWithExpectedSize(k); // store max value of each column
        Map<Integer, Text> chosenTexts = Maps.newHashMapWithExpectedSize(k);
        Map<Integer, ClusterWritable> chosenClusters = Maps.newHashMapWithExpectedSize(k);

        for (FileStatus fileStatus : inputFiles) {
            if (!fileStatus.isDir()) {
                for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                        fileStatus.getPath(), true, conf)) {
                    Writable key = record.getFirst();
                    VectorWritable value = record.getSecond();

                    for (Vector.Element e : value.get().nonZeroes()) {
                        int index = e.index();
                        double v = Math.abs(e.get());

                        if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
                            maxEigens.put(index, v);
                            Text newText = new Text(key.toString());
                            chosenTexts.put(index, newText);
                            Kluster newCluster = new Kluster(value.get(), index, measure);
                            newCluster.observe(value.get(), 1);
                            ClusterWritable clusterWritable = new ClusterWritable();
                            clusterWritable.setValue(newCluster);
                            chosenClusters.put(index, clusterWritable);
                        }
                    }
                }
            }
        }

        try {
            for (Integer key : maxEigens.keySet()) {
                writer.append(chosenTexts.get(key), chosenClusters.get(key));
            }
            log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}

From source file:org.apache.mahout.clustering.kmeans.RandomSeedGenerator.java

License:Apache License

public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure,
        Long seed) throws IOException {

    Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
    // delete the output directory
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-randomSeed");
    boolean newFile = fs.createNewFile(outFile);
    if (newFile) {
        Path inputPathPattern;

        if (fs.getFileStatus(input).isDir()) {
            inputPathPattern = new Path(input, "*");
        } else {
            inputPathPattern = input;
        }

        FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
                ClusterWritable.class);

        Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();

        List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
        List<ClusterWritable> chosenClusters = Lists.newArrayListWithCapacity(k);
        int nextClusterId = 0;

        int index = 0;
        for (FileStatus fileStatus : inputFiles) {
            if (fileStatus.isDir()) {
                continue;
            }
            for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
                    fileStatus.getPath(), true, conf)) {
                Writable key = record.getFirst();
                VectorWritable value = record.getSecond();
                Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
                newCluster.observe(value.get(), 1);
                Text newText = new Text(key.toString());
                int currentSize = chosenTexts.size();
                if (currentSize < k) {
                    chosenTexts.add(newText);
                    ClusterWritable clusterWritable = new ClusterWritable();
                    clusterWritable.setValue(newCluster);
                    chosenClusters.add(clusterWritable);
                } else {
                    int j = random.nextInt(index);
                    if (j < k) {
                        chosenTexts.set(j, newText);
                        ClusterWritable clusterWritable = new ClusterWritable();
                        clusterWritable.setValue(newCluster);
                        chosenClusters.set(j, clusterWritable);
                    }
                }
                index++;
            }
        }

        try {
            for (int i = 0; i < chosenTexts.size(); i++) {
                writer.append(chosenTexts.get(i), chosenClusters.get(i));
            }
            log.info("Wrote {} Klusters to {}", k, outFile);
        } finally {
            Closeables.close(writer, false);
        }
    }

    return outFile;
}

From source file:org.apache.oozie.action.hadoop.FsELFunctions.java

License:Apache License

/**
 * Return whether a path exists.
 *
 * @param pathUri file system path uri.
 * @return <code>true</code> if the path exists, <code>false</code> if it does not.
 * @throws Exception
 */
public static boolean fs_exists(String pathUri) throws Exception {
    Path path = new Path(pathUri);
    FileSystem fs = getFileSystem(path.toUri());
    FileStatus[] pathArr;
    try {
        pathArr = fs.globStatus(path, new FSPathFilter());
    } catch (ReachingGlobMaxException e) {
        throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS013",
                "too many globbed files/dirs to do FS operation");
    }
    return (pathArr != null && pathArr.length > 0);
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSequenceFileInputFormat.java

License:Apache License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    List<FileStatus> files = new ArrayList<FileStatus>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, hiddenFileFilter);
        if (matches == null) {
            throw new IOException("Input path does not exist: " + p);
        } else if (matches.length == 0) {
            throw new IOException("Input Pattern " + p + " matches 0 files");
        } else {
            for (FileStatus globStat : matches) {
                files.add(globStat);
            }
        }
    }
    return MapRedUtil.getAllFileRecursively(files, job.getConfiguration());
}

From source file:org.apache.pig.builtin.TrevniStorage.java

License:Apache License

@Override
public Schema getAvroSchema(Path p[], final Job job) throws IOException {

    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp, Utils.VISIBLE_FILES)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    AvroColumnReader.Params params = new AvroColumnReader.Params(
            new HadoopInput(filePath, job.getConfiguration()));
    AvroColumnReader<GenericData.Record> reader = new AvroColumnReader<GenericData.Record>(params);
    Schema s = reader.getFileSchema();
    reader.close();
    return s;
}