List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
Returns the final component of the path, i.e. the file or directory name without any parent directories.
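Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below; the class name and paths are purely illustrative) of what getName() returns:

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the last path component, with no scheme,
        // authority, or parent directories.
        Path p = new Path("hdfs://namenode:8020/data/alpha/part-00000.dat");
        System.out.println(p.getName());             // prints: part-00000.dat
        System.out.println(p.getParent().getName()); // prints: alpha
    }
}

This is the same call the examples below use to skip files whose names start with "_" (typically Hadoop bookkeeping files such as _SUCCESS or _logs) and to extract table or directory names from full HDFS paths.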
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.ConvertDocumentIDToID.java
License:Apache License
private void loadDocumentIndex(String documentIndexPath) throws IOException {
    if (documentIndex == null) {
        documentIndex = new HashMap<String, Integer>();
        Path p = new Path(documentIndexPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        int index = 0;
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = null;
                    while ((line = reader.readLine()) != null) {
                        documentIndex.put(line.trim(), index++);
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
        log.info("Loaded document index with size: " + documentIndex.size());
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.ConvertFeatureToID.java
License:Apache License
private void loadFeatureIndex(String featureIndexPath) throws IOException {
    if (featureIndex == null) {
        featureIndex = new HashMap<String, Integer>();
        Path p = new Path(featureIndexPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        int index = 0;
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = null;
                    while ((line = reader.readLine()) != null) {
                        featureIndex.put(line.trim(), index++);
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
        log.info("Loaded feature index with size: " + featureIndex.size());
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked", "finally" })
@Override
public void prepareToWrite(RecordWriter writer) throws IOException {
    if (dimensionPath != null) {
        Path p = new Path(dimensionPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = reader.readLine();
                    this.dimensions = Integer.parseInt(line);
                } catch (NumberFormatException nfe) {
                    LOG.error("Unexpected input for dimensions", nfe);
                    throw new IOException();
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                    // TODO: SMELLY: Why loop if we always cancel after the first file?
                    break;
                }
            }
        }
    }
    this.writer = writer;
}
From source file:com.mozilla.hadoop.fs.SequenceFileDirectoryReader.java
License:Apache License
public SequenceFileDirectoryReader(Path inputPath) throws IOException {
    fs = FileSystem.get(inputPath.toUri(), conf);
    paths = new ArrayList<Path>();
    for (FileStatus status : fs.listStatus(inputPath)) {
        Path p = status.getPath();
        if (!status.isDir() && !p.getName().startsWith("_")) {
            paths.add(p);
        }
    }
    pathIter = paths.iterator();
}
From source file:com.mozilla.hadoop.fs.TextFileDirectoryReader.java
License:Apache License
public TextFileDirectoryReader(Path inputPath) throws IOException {
    fs = FileSystem.get(inputPath.toUri(), conf);
    paths = new ArrayList<Path>();
    for (FileStatus status : fs.listStatus(inputPath)) {
        Path p = status.getPath();
        if (!status.isDir() && !p.getName().startsWith("_")) {
            paths.add(p);
        }
    }
    pathIter = paths.iterator();
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/**
 * Opens a new part file.
 *
 * <p>
 * This closes the old bucket file and retrieves a new bucket path from the {@code Bucketer}.
 */
private void openNewPartFile(Path bucketPath, BucketState<T> bucketState) throws Exception {
    closeCurrentPartFile(bucketState);

    FileSystem fs = new Path(basePath).getFileSystem(hadoopConf);

    if (!fs.exists(bucketPath)) {
        try {
            if (fs.mkdirs(bucketPath)) {
                LOG.debug("Created new bucket directory: {}", bucketPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Could not create new bucket path.", e);
        }
    }

    Path partPath = new Path(bucketPath,
            partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix);

    // This should work since there is only one parallel subtask that tries names with
    // our subtask id. Otherwise we would run into concurrency issues here.
    while (fs.exists(partPath) || fs.exists(
            new Path(partPath.getParent(), pendingPrefix + partPath.getName()).suffix(pendingSuffix))) {
        bucketState.partCounter++;
        partPath = new Path(bucketPath,
                partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix);
    }

    // increase, so we don't have to check for this name next time
    bucketState.partCounter++;

    LOG.debug("Next part path is {}", partPath.toString());
    bucketState.currentFile = partPath.toString();

    Path inProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
            .suffix(inProgressSuffix);

    // If we don't already have a writer for this bucket, create one
    if (bucketState.writer == null) {
        bucketState.writer = writerTemplate.duplicate();
    }
    bucketState.writer.open(fs, inProgressPath);
    bucketState.isWriterOpen = true;
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/**
 * Closes the current part file.
 *
 * <p>
 * This moves the current in-progress part file to a pending file and adds it to the list
 * of pending files in our bucket state.
 */
private void closeCurrentPartFile(BucketState<T> bucketState) throws Exception {
    if (bucketState.isWriterOpen) {
        bucketState.writer.close();
        bucketState.isWriterOpen = false;
    }

    if (bucketState.currentFile != null) {
        Path currentPartPath = new Path(bucketState.currentFile);
        Path inProgressPath = new Path(currentPartPath.getParent(),
                inProgressPrefix + currentPartPath.getName()).suffix(inProgressSuffix);
        Path pendingPath = new Path(currentPartPath.getParent(), pendingPrefix + currentPartPath.getName())
                .suffix(pendingSuffix);
        FileSystem fs = inProgressPath.getFileSystem(hadoopConf);
        fs.rename(inProgressPath, pendingPath);
        LOG.debug("Moving in-progress bucket {} to pending file {}", inProgressPath, pendingPath);
        bucketState.pendingFiles.add(currentPartPath.toString());
        bucketState.currentFile = null;
    }
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception {
    synchronized (state.bucketStates) {
        Iterator<Map.Entry<String, BucketState<T>>> it = state.bucketStates.entrySet().iterator();
        while (it.hasNext()) {
            BucketState<T> bucketState = it.next().getValue();
            synchronized (bucketState.pendingFilesPerCheckpoint) {
                Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
                Set<Long> checkpointsToRemove = new HashSet<>();
                for (Long pastCheckpointId : pastCheckpointIds) {
                    if (pastCheckpointId <= checkpointId) {
                        LOG.debug("Moving pending files to final location for checkpoint {}",
                                pastCheckpointId);
                        // All the pending files are buckets that have been completed but are waiting
                        // to be renamed to their final name
                        for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                            Path finalPath = new Path(filename);
                            Path pendingPath = new Path(finalPath.getParent(),
                                    pendingPrefix + finalPath.getName()).suffix(pendingSuffix);

                            FileSystem fs = pendingPath.getFileSystem(hadoopConf);
                            fs.rename(pendingPath, finalPath);
                            LOG.debug(
                                    "Moving pending file {} to final location having completed checkpoint {}.",
                                    pendingPath, pastCheckpointId);
                        }
                        checkpointsToRemove.add(pastCheckpointId);
                    }
                }
                if (!bucketState.isWriterOpen && bucketState.pendingFiles.isEmpty()) {
                    // We've dealt with all the pending files and the writer for this bucket is not
                    // currently open. Therefore this bucket is currently inactive and we can remove
                    // it from our state.
                    it.remove();
                } else {
                    for (Long toRemove : checkpointsToRemove) {
                        bucketState.pendingFilesPerCheckpoint.remove(toRemove);
                    }
                }
            }
        }
    }
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug(
                            "In-progress file {} has been moved to pending after checkpoint, moving to final location.",
                            partPath);
                    // has been moved to pending in the mean time, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.",
                            bucketState.currentFile, partPath);
                } else {
                    LOG.debug(
                            "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                                    + "it was moved to final location by a previous snapshot restore",
                            bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try, we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath,
                            bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }
                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Could not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure
        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting
            // to be renamed to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);
                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug(
                                "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e);
                }
            }
        }
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();

    // delete pending files
    try {
        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);
        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}
From source file:com.mvdb.platform.action.VersionMerge.java
License:Apache License
public static void main(String[] args) throws Exception {
    logger.error("error1");
    logger.warn("warning1");
    logger.info("info1");
    logger.debug("debug1");
    logger.trace("trace1");
    ActionUtils.setUpInitFileProperty();
    // LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory();
    // StatusPrinter.print(lc);

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // Also add lastMergedTimeStamp and mergeUptoTimestamp and passive db name which would be mv1 or mv2
    if (otherArgs.length != 3) {
        System.err.println("Usage: versionmerge <customer-directory>");
        System.exit(2);
    }
    // Example: file:/home/umesh/.mvdb/etl/data/alpha
    // Example: hdfs://localhost:9000/data/alpha
    String customerDirectory = otherArgs[0];
    String lastMergedDirName = otherArgs[1];
    String lastCopiedDirName = otherArgs[2];

    org.apache.hadoop.conf.Configuration conf1 = new org.apache.hadoop.conf.Configuration();
    // conf1.addResource(new Path("/home/umesh/ops/hadoop-1.2.0/conf/core-site.xml"));
    FileSystem hdfsFileSystem = FileSystem.get(conf1);

    Path topPath = new Path(customerDirectory);

    // Clean scratch db
    Path passiveDbPath = new Path(topPath, "db/mv1");
    Path tempDbPath = new Path(topPath, "db/tmp-" + (int) (Math.random() * 100000));
    if (hdfsFileSystem.exists(tempDbPath)) {
        boolean success = hdfsFileSystem.delete(tempDbPath, true);
        if (success == false) {
            System.err.println(String.format("Unable to delete temp directory %s", tempDbPath.toString()));
            System.exit(1);
        }
    }

    // last three parameters are hardcoded and the nulls must be replaced later after changing input parameters.
    Path[] inputPaths = getInputPaths(hdfsFileSystem, topPath, lastMergedDirName, lastCopiedDirName, null);
    Set<String> tableNameSet = new HashSet<String>();
    for (Path path : inputPaths) {
        tableNameSet.add(path.getName());
    }

    Job job = new Job(conf, "versionmerge");
    job.setJarByClass(VersionMerge.class);
    job.setMapperClass(VersionMergeMapper.class);
    job.setReducerClass(VersionMergeReducer.class);
    job.setMapOutputKeyClass(MergeKey.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    String lastDirName = null;
    if (inputPaths != null && inputPaths.length > 1) {
        lastDirName = inputPaths[(inputPaths.length) - 2].getParent().getName();
    }
    for (Path inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, inputPath);
    }
    FileOutputFormat.setOutputPath(job, tempDbPath);

    for (String table : tableNameSet) {
        if (table.endsWith(".dat") == false) {
            continue;
        }
        table = table.replaceAll("-", "");
        table = table.replaceAll(".dat", "");
        MultipleOutputs.addNamedOutput(job, table, SequenceFileOutputFormat.class, Text.class,
                BytesWritable.class);
    }

    boolean success = job.waitForCompletion(true);
    System.out.println("Success:" + success);
    System.out.println(ManagementFactory.getRuntimeMXBean().getName());
    if (success && lastDirName != null) {
        ActionUtils.setConfigurationValue(new Path(customerDirectory).getName(),
                ConfigurationKeys.LAST_MERGE_TO_MVDB_DIRNAME, lastDirName);
    }
    // hdfsFileSystem.delete(passiveDbPath, true);
    // hdfsFileSystem.rename(tempDbPath, passiveDbPath);
    System.exit(success ? 0 : 1);
}