List of usage examples for org.apache.hadoop.fs.FileSystem.rename
public abstract boolean rename(Path src, Path dst) throws IOException;
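Before the project-specific examples, here is a minimal, hypothetical sketch of the basic pattern they all rely on: call rename and check the boolean result. The paths and class name are illustrative only and do not come from any of the projects listed below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Illustrative paths; rename() moves a file or directory within the same FileSystem.
        Path src = new Path("/tmp/staging/part-00000");
        Path dst = new Path("/tmp/final/part-00000");
        // rename() typically returns false rather than throwing when it fails
        // (e.g. missing source or existing destination), so check the result.
        if (!fs.rename(src, dst)) {
            throw new IOException("Could not rename " + src + " to " + dst);
        }
    }
}

Because a rename can fail transiently, several of the examples below (notably the NiFi processors) wrap this call in a retry loop instead of checking it only once.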
From source file: org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("rank", "k", "decomposition rank", true);
    addOption("oversampling", "p", "oversampling", String.valueOf(15));
    addOption("blockHeight", "r", "Y block height (must be > (k+p))", String.valueOf(10000));
    addOption("outerProdBlockHeight", "oh",
        "block height of outer products during multiplication, increase for sparse inputs",
        String.valueOf(30000));
    addOption("abtBlockHeight", "abth",
        "block height of Y_i in ABtJob during AB' multiplication, increase for extremely sparse inputs",
        String.valueOf(200000));
    addOption("minSplitSize", "s", "minimum split size", String.valueOf(-1));
    addOption("computeU", "U", "compute U (true/false)", String.valueOf(true));
    addOption("uHalfSigma", "uhs", "Compute U * Sigma^0.5", String.valueOf(false));
    addOption("uSigma", "us", "Compute U * Sigma", String.valueOf(false));
    addOption("computeV", "V", "compute V (true/false)", String.valueOf(true));
    addOption("vHalfSigma", "vhs", "compute V * Sigma^0.5", String.valueOf(false));
    addOption("reduceTasks", "t", "number of reduce tasks (where applicable)", true);
    addOption("powerIter", "q", "number of additional power iterations (0..2 is good)", String.valueOf(0));
    addOption("broadcast", "br", "whether use distributed cache to broadcast matrices wherever possible",
        String.valueOf(true));
    addOption("pca", "pca", "run in pca mode: compute column-wise mean and subtract from input",
        String.valueOf(false));
    addOption("pcaOffset", "xi", "path(glob) of external pca mean (optional, dont compute, use external mean");
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> pargs = parseArguments(args);
    if (pargs == null) {
        return -1;
    }

    int k = Integer.parseInt(getOption("rank"));
    int p = Integer.parseInt(getOption("oversampling"));
    int r = Integer.parseInt(getOption("blockHeight"));
    int h = Integer.parseInt(getOption("outerProdBlockHeight"));
    int abh = Integer.parseInt(getOption("abtBlockHeight"));
    int q = Integer.parseInt(getOption("powerIter"));
    int minSplitSize = Integer.parseInt(getOption("minSplitSize"));
    boolean computeU = Boolean.parseBoolean(getOption("computeU"));
    boolean computeV = Boolean.parseBoolean(getOption("computeV"));
    boolean cUHalfSigma = Boolean.parseBoolean(getOption("uHalfSigma"));
    boolean cUSigma = Boolean.parseBoolean(getOption("uSigma"));
    boolean cVHalfSigma = Boolean.parseBoolean(getOption("vHalfSigma"));
    int reduceTasks = Integer.parseInt(getOption("reduceTasks"));
    boolean broadcast = Boolean.parseBoolean(getOption("broadcast"));
    String xiPathStr = getOption("pcaOffset");
    Path xiPath = xiPathStr == null ? null : new Path(xiPathStr);
    boolean pca = Boolean.parseBoolean(getOption("pca")) || xiPath != null;
    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }

    Path[] inputPaths = { getInputPath() };
    Path tempPath = getTempPath();
    FileSystem fs = FileSystem.get(getTempPath().toUri(), conf);

    // housekeeping
    if (overwrite) {
        // clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
        // clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
    }

    fs.mkdirs(getOutputPath());

    // MAHOUT-817
    if (pca && xiPath == null) {
        xiPath = new Path(tempPath, "xi");
        if (overwrite) {
            fs.delete(xiPath, true);
        }
        MatrixColumnMeansJob.run(conf, inputPaths[0], xiPath);
    }

    SSVDSolver solver = new SSVDSolver(conf, inputPaths, new Path(tempPath, "ssvd"), r, k, p, reduceTasks);
    solver.setMinSplitSize(minSplitSize);
    solver.setComputeU(computeU);
    solver.setComputeV(computeV);
    solver.setcUHalfSigma(cUHalfSigma);
    solver.setcVHalfSigma(cVHalfSigma);
    solver.setcUSigma(cUSigma);
    solver.setOuterBlockHeight(h);
    solver.setAbtBlockHeight(abh);
    solver.setQ(q);
    solver.setBroadcast(broadcast);
    solver.setOverwrite(overwrite);
    if (xiPath != null) {
        solver.setPcaMeanPath(new Path(xiPath, "part-*"));
    }

    solver.run();

    Vector svalues = solver.getSingularValues().viewPart(0, k);
    SSVDHelper.saveVector(svalues, getOutputPath("sigma"), conf);

    if (computeU && !fs.rename(new Path(solver.getUPath()), getOutputPath())) {
        throw new IOException("Unable to move U results to the output path.");
    }
    if (cUHalfSigma && !fs.rename(new Path(solver.getuHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma^0.5 results to the output path.");
    }
    if (cUSigma && !fs.rename(new Path(solver.getuSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma results to the output path.");
    }
    if (computeV && !fs.rename(new Path(solver.getVPath()), getOutputPath())) {
        throw new IOException("Unable to move V results to the output path.");
    }
    if (cVHalfSigma && !fs.rename(new Path(solver.getvHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move V*Sigma^0.5 results to the output path.");
    }

    // Delete the temp path on exit
    fs.deleteOnExit(getTempPath());

    return 0;
}
From source file: org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License: Apache License
/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node thereby splitting the process across
 * multiple map/reduces.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minValue of log likelihood ratio to used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. Its recommended you calculated this based on the number of cores and the free memory
 *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
 *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
 *          partial vectors without thrashing the system due to increased swapping
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf, int minSupport,
        int maxNGramSize, float minLLRValue, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, minSupport);
        dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, output, chunkSizeInMegabytes,
            new LongWritable(), maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue,
            numReducers);
        dictionaryChunks = createDictionaryChunks(minSupport,
            new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
            chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
    }

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath, maxTermDimension[0],
            sequentialAccess, numReducers);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
            sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License: Apache License
/**
 * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
 * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
 * per node thereby splitting the process across multiple map/reduces.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. Its recommended you calculated this based on the number of cores and the free memory
 *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
 *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
 *          partial vectors without thrashing the system due to increased swapping
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param numReducers
 *          The number of reducers to spawn. This also affects the possible parallelism since each reducer
 *          will typically produce a single output file containing tf-idf vectors for a subset of the
 *          documents in the corpus.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void processTfIdf(Path input, Path output, int chunkSizeInMegabytes, int minDf, int maxDFPercent,
        float normPower, boolean sequentialAccessOutput, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }

    if (normPower != PartialVectorMerger.NO_NORMALIZING && normPower < 0) {
        throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }

    if (minDf < 1) {
        minDf = 1;
    }
    if (maxDFPercent < 0 || maxDFPercent > 100) {
        maxDFPercent = 99;
    }

    Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);

    startDFCounting(input, wordCountPath);
    Pair<Long[], List<Path>> datasetFeatures = createDictionaryChunks(wordCountPath, output,
        chunkSizeInMegabytes);

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    List<Path> dictionaryChunks = datasetFeatures.getSecond();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures.getFirst()[1], minDf,
            maxDFPercent, dictionaryChunk, partialVectorOutputPath, sequentialAccessOutput);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
            datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
From source file: org.apache.metron.writer.hdfs.SourceAwareMoveAction.java
License: Apache License
@Override
public void execute(FileSystem fileSystem, Path filePath) throws IOException {
    Path destPath = new Path(new Path(destination, getSource(filePath)), filePath.getName());
    LOG.info("Moving file " + filePath + " to " + destPath);
    // Note: the boolean result of rename() is captured but not acted upon in this example.
    boolean success = fileSystem.rename(filePath, destPath);
    return;
}
From source file: org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java
License: Apache License
/**
 * Attempts to rename srcFile to destFile up to 10 times, with a 200ms sleep in between each attempt.
 *
 * If the file has not been renamed after 10 attempts, a FailureException is thrown.
 *
 * @param fileSystem the file system where the files are located
 * @param srcFile the source file
 * @param destFile the destination file to rename the source to
 * @throws IOException if IOException happens while attempting to rename
 * @throws InterruptedException if renaming is interrupted
 * @throws FailureException if the file couldn't be renamed after 10 attempts
 */
protected void rename(final FileSystem fileSystem, final Path srcFile, final Path destFile)
        throws IOException, InterruptedException, FailureException {
    boolean renamed = false;
    for (int i = 0; i < 10; i++) { // try to rename multiple times.
        if (fileSystem.rename(srcFile, destFile)) {
            renamed = true;
            break; // rename was successful
        }
        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
    }
    if (!renamed) {
        fileSystem.delete(srcFile, false);
        throw new FailureException("Could not rename file " + srcFile + " to its final filename");
    }
}
From source file: org.apache.nifi.processors.hadoop.MoveHDFS.java
License: Apache License
protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session, FlowFile parentFlowFile) {
    Preconditions.checkState(parentFlowFile != null, "No parent flowfile for this batch was provided");

    // process the batch of files
    final Configuration conf = getConfiguration();
    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (conf == null || ugi == null) {
        getLogger().error("Configuration or UserGroupInformation not configured properly");
        session.transfer(parentFlowFile, REL_FAILURE);
        context.yield();
        return;
    }

    for (final Path file : files) {
        ugi.doAs(new PrivilegedAction<Object>() {
            @Override
            public Object run() {
                FlowFile flowFile = session.create(parentFlowFile);
                try {
                    final String originalFilename = file.getName();
                    final Path configuredRootOutputDirPath = processorConfig.getOutputDirectory();
                    final Path newFile = new Path(configuredRootOutputDirPath, originalFilename);
                    final boolean destinationExists = hdfs.exists(newFile);

                    // If destination file already exists, resolve that
                    // based on processor configuration
                    if (destinationExists) {
                        switch (processorConfig.getConflictResolution()) {
                        case REPLACE_RESOLUTION:
                            if (hdfs.delete(file, false)) {
                                getLogger().info("deleted {} in order to replace with the contents of {}",
                                    new Object[] { file, flowFile });
                            }
                            break;
                        case IGNORE_RESOLUTION:
                            session.transfer(flowFile, REL_SUCCESS);
                            getLogger().info(
                                "transferring {} to success because file with same name already exists",
                                new Object[] { flowFile });
                            return null;
                        case FAIL_RESOLUTION:
                            session.transfer(session.penalize(flowFile), REL_FAILURE);
                            getLogger().warn(
                                "penalizing {} and routing to failure because file with same name already exists",
                                new Object[] { flowFile });
                            return null;
                        default:
                            break;
                        }
                    }

                    // Create destination directory if it does not exist
                    try {
                        if (!hdfs.getFileStatus(configuredRootOutputDirPath).isDirectory()) {
                            throw new IOException(configuredRootOutputDirPath.toString()
                                + " already exists and is not a directory");
                        }
                    } catch (FileNotFoundException fe) {
                        if (!hdfs.mkdirs(configuredRootOutputDirPath)) {
                            throw new IOException(configuredRootOutputDirPath.toString()
                                + " could not be created");
                        }
                        changeOwner(context, hdfs, configuredRootOutputDirPath);
                    }

                    boolean moved = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (processorConfig.getOperation().equals("move")) {
                            if (hdfs.rename(file, newFile)) {
                                moved = true;
                                break; // rename was successful
                            }
                        } else {
                            if (FileUtil.copy(hdfs, file, hdfs, newFile, false, conf)) {
                                moved = true;
                                break; // copy was successful
                            }
                        }
                        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!moved) {
                        throw new ProcessException("Could not move file " + file + " to its final filename");
                    }

                    changeOwner(context, hdfs, newFile);
                    final String outputPath = newFile.toString();
                    final String newFilename = newFile.getName();
                    final String hdfsPath = newFile.getParent().toString();
                    flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename);
                    flowFile = session.putAttribute(flowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                    final String transitUri = (outputPath.startsWith("/")) ? "hdfs:/" + outputPath
                        : "hdfs://" + outputPath;
                    session.getProvenanceReporter().send(flowFile, transitUri);

                    session.transfer(flowFile, REL_SUCCESS);
                } catch (final Throwable t) {
                    getLogger().error("Failed to rename on HDFS due to {}", new Object[] { t });
                    session.transfer(session.penalize(flowFile), REL_FAILURE);
                    context.yield();
                }
                return null;
            }
        });
    }
}
From source file: org.apache.nifi.processors.hadoop.PutHDFS.java
License: Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY)
                    .evaluateAttributeExpressions(putFlowFile).getValue();
                final Path configuredRootDirPath = new Path(dirValue);

                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();

                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue()
                    : hdfs.getDefaultBlockSize(configuredRootDirPath);

                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
                    : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);

                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue()
                    : hdfs.getDefaultReplication(configuredRootDirPath);

                final CompressionCodec codec = getCompressionCodec(context, configuration);

                final String filename = codec != null
                    ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension()
                    : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());

                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);

                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(configuredRootDirPath.toString()
                            + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }

                final boolean destinationExists = hdfs.exists(copyFile);

                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch (conflictResponse) {
                    case REPLACE_RESOLUTION:
                        if (hdfs.delete(copyFile, false)) {
                            getLogger().info("deleted {} in order to replace with the contents of {}",
                                new Object[] { copyFile, putFlowFile });
                        }
                        break;
                    case IGNORE_RESOLUTION:
                        session.transfer(putFlowFile, REL_SUCCESS);
                        getLogger().info(
                            "transferring {} to success because file with same name already exists",
                            new Object[] { putFlowFile });
                        return null;
                    case FAIL_RESOLUTION:
                        session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                        getLogger().warn(
                            "penalizing {} and routing to failure because file with same name already exists",
                            new Object[] { putFlowFile });
                        return null;
                    default:
                        break;
                    }
                }

                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {
                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }
                });
                stopWatch.stop();

                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;

                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue())
                        || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            break; // rename was successful
                        }
                        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file "
                            + tempCopyFile + " to its final filename");
                    }
                    changeOwner(context, hdfs, copyFile, flowFile);
                }

                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}",
                    new Object[] { putFlowFile, copyFile, millis, dataRate });

                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());

                session.transfer(putFlowFile, REL_SUCCESS);
            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}",
                            new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }
            return null;
        }
    });
}
From source file: org.apache.nutch.admin.index.IndexThread.java
License: Apache License
public void run() {
    try {
        // this.fMessage = "index.running";
        FileSystem fs = FileSystem.get(this.conf);
        if (fs.exists(this.indexDir)) {
            Path backup_path = new Path(this.conf.get("searcher.dir"), "index_backup_" + new Date().getTime());
            System.out.println("Backing up " + this.indexDir + " as " + backup_path + "...");
            fs.rename(this.indexDir, backup_path);
            System.out.println("Done.");
        }
        System.out.println("Re-indexing in " + this.indexDir.toString());
        this.indexer.index(this.indexDir, this.tableName);
    } catch (IOException e) {
        LOG.warning(e.toString());
        this.fMessage = e.toString();
    }
}
From source file: org.apache.nutch.crawl.CrawlDbMerger.java
License: Apache License
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlDb merge: starting at " + sdf.format(start));

    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Adding " + dbs[i]);
        }
        FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
    }
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME));
    long end = System.currentTimeMillis();
    LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.crawl.LinkDb.java
License: Apache License
public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old))
            fs.delete(old, true);
        fs.rename(current, old);
    }
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    if (fs.exists(old))
        fs.delete(old, true);
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}