Example usage for org.apache.hadoop.fs FileSystem rename

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#rename from open-source Apache projects.

Prototype

public abstract boolean rename(Path src, Path dst) throws IOException;

Document

Renames Path src to Path dst, returning true if the rename succeeded and false otherwise.
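
Before the project examples below, here is a minimal sketch of the typical calling pattern. The class name, paths, and no-argument Configuration are illustrative placeholders rather than code taken from any of the sources on this page; the point it shows is that rename reports most failures by returning false, so the result should always be checked.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path src = new Path("/tmp/report.tmp");   // hypothetical source path
        Path dst = new Path("/data/report.txt");  // hypothetical destination path

        // rename() signals most failures (for example, a missing source) by
        // returning false rather than throwing, so check the boolean result.
        if (!fs.rename(src, dst)) {
            throw new IOException("Unable to rename " + src + " to " + dst);
        }
    }
}

The Mahout and NiFi examples below follow the same idea, either failing fast when rename returns false or retrying the call a few times before giving up.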

Usage

From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("rank", "k", "decomposition rank", true);
    addOption("oversampling", "p", "oversampling", String.valueOf(15));
    addOption("blockHeight", "r", "Y block height (must be > (k+p))", String.valueOf(10000));
    addOption("outerProdBlockHeight", "oh",
            "block height of outer products during multiplication, increase for sparse inputs",
            String.valueOf(30000));
    addOption("abtBlockHeight", "abth",
            "block height of Y_i in ABtJob during AB' multiplication, increase for extremely sparse inputs",
            String.valueOf(200000));
    addOption("minSplitSize", "s", "minimum split size", String.valueOf(-1));
    addOption("computeU", "U", "compute U (true/false)", String.valueOf(true));
    addOption("uHalfSigma", "uhs", "Compute U * Sigma^0.5", String.valueOf(false));
    addOption("uSigma", "us", "Compute U * Sigma", String.valueOf(false));
    addOption("computeV", "V", "compute V (true/false)", String.valueOf(true));
    addOption("vHalfSigma", "vhs", "compute V * Sigma^0.5", String.valueOf(false));
    addOption("reduceTasks", "t", "number of reduce tasks (where applicable)", true);
    addOption("powerIter", "q", "number of additional power iterations (0..2 is good)", String.valueOf(0));
    addOption("broadcast", "br", "whether use distributed cache to broadcast matrices wherever possible",
            String.valueOf(true));
    addOption("pca", "pca", "run in pca mode: compute column-wise mean and subtract from input",
            String.valueOf(false));
    addOption("pcaOffset", "xi", "path(glob) of external pca mean (optional, dont compute, use external mean");
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> pargs = parseArguments(args);
    if (pargs == null) {
        return -1;
    }

    int k = Integer.parseInt(getOption("rank"));
    int p = Integer.parseInt(getOption("oversampling"));
    int r = Integer.parseInt(getOption("blockHeight"));
    int h = Integer.parseInt(getOption("outerProdBlockHeight"));
    int abh = Integer.parseInt(getOption("abtBlockHeight"));
    int q = Integer.parseInt(getOption("powerIter"));
    int minSplitSize = Integer.parseInt(getOption("minSplitSize"));
    boolean computeU = Boolean.parseBoolean(getOption("computeU"));
    boolean computeV = Boolean.parseBoolean(getOption("computeV"));
    boolean cUHalfSigma = Boolean.parseBoolean(getOption("uHalfSigma"));
    boolean cUSigma = Boolean.parseBoolean(getOption("uSigma"));
    boolean cVHalfSigma = Boolean.parseBoolean(getOption("vHalfSigma"));
    int reduceTasks = Integer.parseInt(getOption("reduceTasks"));
    boolean broadcast = Boolean.parseBoolean(getOption("broadcast"));
    String xiPathStr = getOption("pcaOffset");
    Path xiPath = xiPathStr == null ? null : new Path(xiPathStr);
    boolean pca = Boolean.parseBoolean(getOption("pca")) || xiPath != null;

    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }

    Path[] inputPaths = { getInputPath() };
    Path tempPath = getTempPath();
    FileSystem fs = FileSystem.get(getTempPath().toUri(), conf);

    // housekeeping
    if (overwrite) {
        // clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
        // clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
    }

    fs.mkdirs(getOutputPath());

    // MAHOUT-817
    if (pca && xiPath == null) {
        xiPath = new Path(tempPath, "xi");
        if (overwrite) {
            fs.delete(xiPath, true);
        }
        MatrixColumnMeansJob.run(conf, inputPaths[0], xiPath);
    }

    SSVDSolver solver = new SSVDSolver(conf, inputPaths, new Path(tempPath, "ssvd"), r, k, p, reduceTasks);

    solver.setMinSplitSize(minSplitSize);
    solver.setComputeU(computeU);
    solver.setComputeV(computeV);
    solver.setcUHalfSigma(cUHalfSigma);
    solver.setcVHalfSigma(cVHalfSigma);
    solver.setcUSigma(cUSigma);
    solver.setOuterBlockHeight(h);
    solver.setAbtBlockHeight(abh);
    solver.setQ(q);
    solver.setBroadcast(broadcast);
    solver.setOverwrite(overwrite);

    if (xiPath != null) {
        solver.setPcaMeanPath(new Path(xiPath, "part-*"));
    }

    solver.run();

    Vector svalues = solver.getSingularValues().viewPart(0, k);
    SSVDHelper.saveVector(svalues, getOutputPath("sigma"), conf);

    if (computeU && !fs.rename(new Path(solver.getUPath()), getOutputPath())) {
        throw new IOException("Unable to move U results to the output path.");
    }
    if (cUHalfSigma && !fs.rename(new Path(solver.getuHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma^0.5 results to the output path.");
    }
    if (cUSigma && !fs.rename(new Path(solver.getuSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma results to the output path.");
    }
    if (computeV && !fs.rename(new Path(solver.getVPath()), getOutputPath())) {
        throw new IOException("Unable to move V results to the output path.");
    }
    if (cVHalfSigma && !fs.rename(new Path(solver.getvHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move V*Sigma^0.5 results to the output path.");
    }

    // Delete the temp path on exit
    fs.deleteOnExit(getTempPath());

    return 0;
}

From source file:org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java

License:Apache License

/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * caps the maximum memory used by the feature chunk per node, thereby splitting the process across
 * multiple map/reduce jobs.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}s of the documents
 *          are generated
 * @param minSupport
 *          the minimum frequency of a feature in the entire corpus for it to be included in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minimum log-likelihood ratio used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during the Map/Reduce
 *          stage. It is recommended that you calculate this from the number of cores and the free memory
 *          available per node. For example, with 2 cores and around 1GB of spare memory, a chunk size of
 *          around 400-500MB lets two simultaneous reducers create partial vectors without thrashing the
 *          system through excessive swapping.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf, int minSupport,
        int maxNGramSize, float minLLRValue, int numReducers, int chunkSizeInMegabytes,
        boolean sequentialAccess) throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, minSupport);
        dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, output, chunkSizeInMegabytes,
                new LongWritable(), maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue,
                numReducers);
        dictionaryChunks = createDictionaryChunks(minSupport,
                new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
                chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
    }

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath, maxTermDimension[0],
                sequentialAccess, numReducers);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
                sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}

From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java

License:Apache License

/**
 * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
 * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
 * per node, thereby splitting the process across multiple map/reduce jobs.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}s of the documents
 *          are generated
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during the Map/Reduce
 *          stage. It is recommended that you calculate this from the number of cores and the free memory
 *          available per node. For example, with 2 cores and around 1GB of spare memory, a chunk size of
 *          around 400-500MB lets two simultaneous reducers create partial vectors without thrashing the
 *          system through excessive swapping.
 * @param minDf
 *          the minimum document frequency. Default 1
 * @param maxDFPercent
 *          the maximum percentage of vectors for the DF; can be used to remove very high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param numReducers
 *          the number of reducers to spawn. This also affects the possible parallelism since each reducer
 *          will typically produce a single output file containing tf-idf vectors for a subset of the
 *          documents in the corpus.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void processTfIdf(Path input, Path output, int chunkSizeInMegabytes, int minDf, int maxDFPercent,
        float normPower, boolean sequentialAccessOutput, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }

    if (normPower != PartialVectorMerger.NO_NORMALIZING && normPower < 0) {
        throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }

    if (minDf < 1) {
        minDf = 1;
    }
    if (maxDFPercent < 0 || maxDFPercent > 100) {
        maxDFPercent = 99;
    }

    Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);

    startDFCounting(input, wordCountPath);
    Pair<Long[], List<Path>> datasetFeatures = createDictionaryChunks(wordCountPath, output,
            chunkSizeInMegabytes);

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    List<Path> dictionaryChunks = datasetFeatures.getSecond();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures.getFirst()[1], minDf,
                maxDFPercent, dictionaryChunk, partialVectorOutputPath, sequentialAccessOutput);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
                datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}

From source file:org.apache.metron.writer.hdfs.SourceAwareMoveAction.java

License:Apache License

@Override
public void execute(FileSystem fileSystem, Path filePath) throws IOException {
    Path destPath = new Path(new Path(destination, getSource(filePath)), filePath.getName());
    LOG.info("Moving file " + filePath + " to " + destPath);
    boolean success = fileSystem.rename(filePath, destPath);
    return;
}

From source file:org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java

License:Apache License

/**
 * Attempts to rename srcFile to destFile up to 10 times, with a 200ms sleep in between each attempt.
 *
 * If the file has not been renamed after 10 attempts, a FailureException is thrown.
 *
 * @param fileSystem the file system where the files are located
 * @param srcFile the source file
 * @param destFile the destination file to rename the source to
 * @throws IOException if IOException happens while attempting to rename
 * @throws InterruptedException if renaming is interrupted
 * @throws FailureException if the file couldn't be renamed after 10 attempts
 */
protected void rename(final FileSystem fileSystem, final Path srcFile, final Path destFile)
        throws IOException, InterruptedException, FailureException {
    boolean renamed = false;
    for (int i = 0; i < 10; i++) { // try to rename multiple times.
        if (fileSystem.rename(srcFile, destFile)) {
            renamed = true;
            break;// rename was successful
        }
        Thread.sleep(200L);// try waiting to let whatever might cause rename failure to resolve
    }
    if (!renamed) {
        fileSystem.delete(srcFile, false);
        throw new FailureException("Could not rename file " + srcFile + " to its final filename");
    }
}

From source file:org.apache.nifi.processors.hadoop.MoveHDFS.java

License:Apache License

protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session, FlowFile parentFlowFile) {
    Preconditions.checkState(parentFlowFile != null, "No parent flowfile for this batch was provided");

    // process the batch of files
    final Configuration conf = getConfiguration();
    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (conf == null || ugi == null) {
        getLogger().error("Configuration or UserGroupInformation not configured properly");
        session.transfer(parentFlowFile, REL_FAILURE);
        context.yield();
        return;
    }

    for (final Path file : files) {

        ugi.doAs(new PrivilegedAction<Object>() {
            @Override
            public Object run() {
                FlowFile flowFile = session.create(parentFlowFile);
                try {
                    final String originalFilename = file.getName();
                    final Path configuredRootOutputDirPath = processorConfig.getOutputDirectory();
                    final Path newFile = new Path(configuredRootOutputDirPath, originalFilename);
                    final boolean destinationExists = hdfs.exists(newFile);
                    // If destination file already exists, resolve that
                    // based on processor configuration
                    if (destinationExists) {
                        switch (processorConfig.getConflictResolution()) {
                        case REPLACE_RESOLUTION:
                            if (hdfs.delete(file, false)) {
                                getLogger().info("deleted {} in order to replace with the contents of {}",
                                        new Object[] { file, flowFile });
                            }
                            break;
                        case IGNORE_RESOLUTION:
                            session.transfer(flowFile, REL_SUCCESS);
                            getLogger().info(
                                    "transferring {} to success because file with same name already exists",
                                    new Object[] { flowFile });
                            return null;
                        case FAIL_RESOLUTION:
                            session.transfer(session.penalize(flowFile), REL_FAILURE);
                            getLogger().warn(
                                    "penalizing {} and routing to failure because file with same name already exists",
                                    new Object[] { flowFile });
                            return null;
                        default:
                            break;
                        }
                    }

                    // Create destination directory if it does not exist
                    try {
                        if (!hdfs.getFileStatus(configuredRootOutputDirPath).isDirectory()) {
                            throw new IOException(configuredRootOutputDirPath.toString()
                                    + " already exists and is not a directory");
                        }
                    } catch (FileNotFoundException fe) {
                        if (!hdfs.mkdirs(configuredRootOutputDirPath)) {
                            throw new IOException(
                                    configuredRootOutputDirPath.toString() + " could not be created");
                        }
                        changeOwner(context, hdfs, configuredRootOutputDirPath);
                    }

                    boolean moved = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (processorConfig.getOperation().equals("move")) {
                            if (hdfs.rename(file, newFile)) {
                                moved = true;
                                break;// rename was successful
                            }
                        } else {
                            if (FileUtil.copy(hdfs, file, hdfs, newFile, false, conf)) {
                                moved = true;
                                break;// copy was successful
                            }
                        }
                        Thread.sleep(200L);// try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!moved) {
                        throw new ProcessException("Could not move file " + file + " to its final filename");
                    }

                    changeOwner(context, hdfs, newFile);
                    final String outputPath = newFile.toString();
                    final String newFilename = newFile.getName();
                    final String hdfsPath = newFile.getParent().toString();
                    flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename);
                    flowFile = session.putAttribute(flowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                    final String transitUri = (outputPath.startsWith("/")) ? "hdfs:/" + outputPath
                            : "hdfs://" + outputPath;
                    session.getProvenanceReporter().send(flowFile, transitUri);
                    session.transfer(flowFile, REL_SUCCESS);

                } catch (final Throwable t) {
                    getLogger().error("Failed to rename on HDFS due to {}", new Object[] { t });
                    session.transfer(session.penalize(flowFile), REL_FAILURE);
                    context.yield();
                }
                return null;
            }
        });
    }
}

From source file:org.apache.nifi.processors.hadoop.PutHDFS.java

License:Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY).evaluateAttributeExpressions(putFlowFile)
                        .getValue();
                final Path configuredRootDirPath = new Path(dirValue);

                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();

                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue()
                        : hdfs.getDefaultBlockSize(configuredRootDirPath);

                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
                        : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);

                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue()
                        : hdfs.getDefaultReplication(configuredRootDirPath);

                final CompressionCodec codec = getCompressionCodec(context, configuration);

                final String filename = codec != null
                        ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension()
                        : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());

                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);

                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(
                                configuredRootDirPath.toString() + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }

                final boolean destinationExists = hdfs.exists(copyFile);

                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch (conflictResponse) {
                    case REPLACE_RESOLUTION:
                        if (hdfs.delete(copyFile, false)) {
                            getLogger().info("deleted {} in order to replace with the contents of {}",
                                    new Object[] { copyFile, putFlowFile });
                        }
                        break;
                    case IGNORE_RESOLUTION:
                        session.transfer(putFlowFile, REL_SUCCESS);
                        getLogger().info(
                                "transferring {} to success because file with same name already exists",
                                new Object[] { putFlowFile });
                        return null;
                    case FAIL_RESOLUTION:
                        session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                        getLogger().warn(
                                "penalizing {} and routing to failure because file with same name already exists",
                                new Object[] { putFlowFile });
                        return null;
                    default:
                        break;
                    }
                }

                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {

                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }

                });
                stopWatch.stop();
                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;

                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue())
                        || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            break;// rename was successful
                        }
                        Thread.sleep(200L);// try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file "
                                + tempCopyFile + " to its final filename");
                    }

                    changeOwner(context, hdfs, copyFile, flowFile);
                }

                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}",
                        new Object[] { putFlowFile, copyFile, millis, dataRate });

                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());

                session.transfer(putFlowFile, REL_SUCCESS);

            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}",
                                new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }

            return null;
        }
    });
}

From source file:org.apache.nutch.admin.index.IndexThread.java

License:Apache License

public void run() {
    try {
        //      this.fMessage = "index.running";
        FileSystem fs = FileSystem.get(this.conf);
        if (fs.exists(this.indexDir)) {
            Path backup_path = new Path(this.conf.get("searcher.dir"), "index_backup_" + new Date().getTime());
            System.out.println("Backing up " + this.indexDir + " as " + backup_path + "...");
            fs.rename(this.indexDir, backup_path);
            System.out.println("Done.");
        }

        System.out.println("Re-indexing in " + this.indexDir.toString());
        this.indexer.index(this.indexDir, this.tableName);
    } catch (IOException e) {
        LOG.warning(e.toString());
        this.fMessage = e.toString();
    }
}

From source file:org.apache.nutch.crawl.CrawlDbMerger.java

License:Apache License

public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlDb merge: starting at " + sdf.format(start));

    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Adding " + dbs[i]);
        }
        FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
    }
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME));
    long end = System.currentTimeMillis();
    LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.crawl.LinkDb.java

License:Apache License

public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old))
            fs.delete(old, true);
        fs.rename(current, old);
    }
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    if (fs.exists(old))
        fs.delete(old, true);
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}