Example usage for org.apache.hadoop.fs FileUtil copy

Introduction

On this page you can find example usage for org.apache.hadoop.fs.FileUtil.copy.

Prototype

public static boolean copy(FileSystem srcFS, FileStatus srcStatus, FileSystem dstFS, Path dst,
        boolean deleteSource, boolean overwrite, Configuration conf) throws IOException 

Document

Copy files between FileSystems.
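
All of the examples below call the closely related overloads that take a source Path (or an array of Paths) rather than a FileStatus. A minimal sketch of a single-path copy, assuming a reachable cluster and placeholder /tmp paths that are not taken from the examples:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class FileUtilCopyExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Each Path resolves its own FileSystem, so the copy can cross
        // FileSystem boundaries (for example, local disk to HDFS).
        Path src = new Path("hdfs:///tmp/source-dir");      // placeholder
        Path dst = new Path("hdfs:///tmp/destination-dir"); // placeholder
        FileSystem srcFS = src.getFileSystem(conf);
        FileSystem dstFS = dst.getFileSystem(conf);

        // deleteSource = false keeps the source in place;
        // overwrite = true replaces an existing destination.
        boolean copied = FileUtil.copy(srcFS, src, dstFS, dst, false, true, conf);
        System.out.println("copy succeeded: " + copied);
    }
}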

Usage

From source file: org.apache.kylin.dict.global.GlobalDictHDFSStore.java

License: Apache License

@Override
void prepareForWrite(String workingDir) throws IOException {
    // TODO create lock file
    Path working = new Path(workingDir);

    if (fileSystem.exists(working)) {
        fileSystem.delete(working, true);
        logger.info("Working directory {} exists, delete it first", working);
    }

    // when build dict, copy all data into working dir and work on it, avoiding suddenly server crash made data corrupt
    Long[] versions = listAllVersions();
    if (versions.length > 0) {
        Path latestVersion = getVersionDir(versions[versions.length - 1]);
        FileUtil.copy(fileSystem, latestVersion, fileSystem, working, false, true, conf);
    } else {
        fileSystem.mkdirs(working);
    }
}

From source file: org.apache.kylin.dict.global.GlobalDictHDFSStore.java

License: Apache License

@Override
public String copyToAnotherMeta(KylinConfig srcConfig, KylinConfig dstConfig) throws IOException {
    checkArgument(baseDir.startsWith(srcConfig.getHdfsWorkingDirectory()),
            "Please check why current directory {} doesn't belong to source working directory {}", baseDir,
            srcConfig.getHdfsWorkingDirectory());

    final String dstBaseDir = baseDir.replaceFirst(srcConfig.getHdfsWorkingDirectory(),
            dstConfig.getHdfsWorkingDirectory());

    Long[] versions = listAllVersions();
    if (versions.length == 0) { // empty dict, nothing to copy
        return dstBaseDir;
    }

    Path srcVersionDir = getVersionDir(versions[versions.length - 1]);
    Path dstVersionDir = new Path(srcVersionDir.toString().replaceFirst(srcConfig.getHdfsWorkingDirectory(),
            dstConfig.getHdfsWorkingDirectory()));
    FileSystem dstFS = dstVersionDir.getFileSystem(conf);
    if (dstFS.exists(dstVersionDir)) {
        dstFS.delete(dstVersionDir, true);
    }
    FileUtil.copy(fileSystem, srcVersionDir, dstFS, dstVersionDir, false, true, conf);

    return dstBaseDir;
}

From source file: org.apache.mahout.cf.taste.example.email.MailToPrefsDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("chunkSize", "cs", "The size of chunks to write.  Default is 100 mb", "100");
    addOption("separator", "sep",
            "The separator used in the input file to separate to, from, subject.  Default is \\n", "\n");
    addOption("from", "f",
            "The position in the input text (value) where the from email is located, starting from "
                    + "zero (0).",
            "0");
    addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
            + "starting from zero (0).", "1");
    addOption(buildOption("useCounts", "u",
            "If set, then use the number of times the user has interacted with a "
                    + "thread as an indication of their preference.  Otherwise, use boolean preferences.",
            false, false, String.valueOf(true)));
    Map<String, List<String>> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    int chunkSize = Integer.parseInt(getOption("chunkSize"));
    String separator = getOption("separator");
    Configuration conf = getConf();
    boolean useCounts = hasOption("useCounts");
    AtomicInteger currentPhase = new AtomicInteger();
    int[] msgDim = new int[1];
    //TODO: mod this to not do so many passes over the data.  Dictionary creation could probably be a chain mapper
    List<Path> msgIdChunks = null;
    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
    // create the dictionary between message ids and longs
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        //TODO: there seems to be a pattern emerging for dictionary creation
        // -- sparse vectors from seq files also has this.
        Path msgIdsPath = new Path(output, "msgIds");
        if (overwrite) {
            HadoopUtil.delete(conf, msgIdsPath);
        }
        log.info("Creating Msg Id Dictionary");
        Job createMsgIdDictionary = prepareJob(input, msgIdsPath, SequenceFileInputFormat.class,
                MsgIdToDictionaryMapper.class, Text.class, VarIntWritable.class, MailToDictionaryReducer.class,
                Text.class, VarIntWritable.class, SequenceFileOutputFormat.class);

        boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
        //write out the dictionary at the top level
        msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
                createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
    }
    //create the dictionary between from email addresses and longs
    List<Path> fromChunks = null;
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Path fromIdsPath = new Path(output, "fromIds");
        if (overwrite) {
            HadoopUtil.delete(conf, fromIdsPath);
        }
        log.info("Creating From Id Dictionary");
        Job createFromIdDictionary = prepareJob(input, fromIdsPath, SequenceFileInputFormat.class,
                FromEmailToDictionaryMapper.class, Text.class, VarIntWritable.class,
                MailToDictionaryReducer.class, Text.class, VarIntWritable.class,
                SequenceFileOutputFormat.class);
        createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
        boolean succeeded = createFromIdDictionary.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
        //write out the dictionary at the top level
        int[] fromDim = new int[1];
        fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
                createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
    }
    //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
    if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
        //Job map
        //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
        // we don't need the double loop
        log.info("Creating recommendation matrix");
        Path vecPath = new Path(output, "recInput");
        if (overwrite) {
            HadoopUtil.delete(conf, vecPath);
        }
        //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
        conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
        conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
        conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
        conf.set(EmailUtility.FROM_INDEX, getOption("from"));
        conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
        conf.set(EmailUtility.SEPARATOR, separator);
        conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
        int j = 0;
        int i = 0;
        for (Path fromChunk : fromChunks) {
            for (Path idChunk : msgIdChunks) {
                Path out = new Path(vecPath, "tmp-" + i + '-' + j);
                DistributedCache.setCacheFiles(new URI[] { fromChunk.toUri(), idChunk.toUri() }, conf);
                Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
                        MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class,
                        Text.class, NullWritable.class, TextOutputFormat.class);
                createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
                boolean succeeded = createRecMatrix.waitForCompletion(true);
                if (!succeeded) {
                    return -1;
                }
                //copy the results up a level
                //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
                // conf, "");
                FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB,
                        PathFilters.partFilter(), null, conf);
                for (int k = 0; k < fs.length; k++) {
                    FileStatus f = fs[k];
                    Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
                    FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf),
                            outPath, true, overwrite, conf);
                }
                HadoopUtil.delete(conf, out);
                j++;
            }
            i++;
        }
        //concat the files together
        /*Path mergePath = new Path(output, "vectors.dat");
        if (overwrite) {
          HadoopUtil.delete(conf, mergePath);
        }
        log.info("Merging together output vectors to vectors.dat in {}", output);*/
        //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
        // false, conf, "\n");
    }

    return 0;
}

From source file: org.apache.slider.common.tools.SliderUtils.java

License: Apache License

/**
 * Copy a directory to a new FS - both paths must be qualified. If
 * a directory needs to be created, supplied permissions can override
 * the default values. Existing directories are not touched
 * @param conf conf file
 * @param srcDirPath src dir
 * @param destDirPath dest dir
 * @param permission permission for the dest directory; null means "default"
 * @return # of files copied
 */
public static int copyDirectory(Configuration conf, Path srcDirPath, Path destDirPath, FsPermission permission)
        throws IOException, BadClusterStateException {
    FileSystem srcFS = FileSystem.get(srcDirPath.toUri(), conf);
    FileSystem destFS = FileSystem.get(destDirPath.toUri(), conf);
    //list all paths in the src.
    if (!srcFS.exists(srcDirPath)) {
        throw new FileNotFoundException("Source dir not found " + srcDirPath);
    }
    if (!srcFS.isDirectory(srcDirPath)) {
        throw new FileNotFoundException("Source dir not a directory " + srcDirPath);
    }
    GlobFilter dotFilter = new GlobFilter("[!.]*");
    FileStatus[] entries = srcFS.listStatus(srcDirPath, dotFilter);
    int srcFileCount = entries.length;
    if (srcFileCount == 0) {
        return 0;
    }
    if (permission == null) {
        permission = FsPermission.getDirDefault();
    }
    if (!destFS.exists(destDirPath)) {
        new SliderFileSystem(destFS, conf).createWithPermissions(destDirPath, permission);
    }
    Path[] sourcePaths = new Path[srcFileCount];
    for (int i = 0; i < srcFileCount; i++) {
        FileStatus e = entries[i];
        Path srcFile = e.getPath();
        if (srcFS.isDirectory(srcFile)) {
            String msg = "Configuration dir " + srcDirPath + " contains a directory " + srcFile;
            log.warn(msg);
            throw new IOException(msg);
        }
        log.debug("copying src conf file {}", srcFile);
        sourcePaths[i] = srcFile;
    }
    log.debug("Copying {} files from {} to dest {}", srcFileCount, srcDirPath, destDirPath);
    FileUtil.copy(srcFS, sourcePaths, destFS, destDirPath, false, true, conf);
    return srcFileCount;
}

From source file: org.apache.sysml.runtime.util.MapReduceTool.java

License: Apache License

public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
    Path originalPath = new Path(originalDir);
    Path newPath = new Path(newDir);
    boolean deleteSource = false;
    boolean overwrite = true;

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(originalPath, job);
    if (fs.exists(originalPath)) {
        FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
    }
}

From source file: org.apache.sysml.yarn.DMLYarnClient.java

License: Apache License

@SuppressWarnings("deprecation")
private void copyResourcesToHdfsWorkingDir(YarnConfiguration yconf, String hdfsWD)
        throws ParseException, IOException, DMLRuntimeException, InterruptedException {
    Path confPath = new Path(hdfsWD, DML_CONFIG_NAME);
    FileSystem fs = IOUtilFunctions.getFileSystem(confPath, yconf);

    //create working directory
    MapReduceTool.createDirIfNotExistOnHDFS(confPath, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //serialize the dml config to HDFS file 
    //NOTE: we do not modify and ship the absolute scratch space path of the current user
    //because this might result in permission issues if the app master is run with a different user
    //(runtime plan migration during resource reoptimizations now needs to use qualified names
    //for shipping/reading intermediates) TODO modify resource reoptimizer on prototype integration.
    try (FSDataOutputStream fout = fs.create(confPath, true)) {
        fout.writeBytes(_dmlConfig.serializeDMLConfig() + "\n");
    }
    _hdfsDMLConfig = confPath.makeQualified(fs).toString();
    LOG.debug("DML config written to HDFS file: " + _hdfsDMLConfig + "");

    //serialize the dml script to HDFS file
    Path scriptPath = new Path(hdfsWD, DML_SCRIPT_NAME);
    try (FSDataOutputStream fout2 = fs.create(scriptPath, true)) {
        fout2.writeBytes(_dmlScript);
    }
    _hdfsDMLScript = scriptPath.makeQualified(fs).toString();
    LOG.debug("DML script written to HDFS file: " + _hdfsDMLScript + "");

    // copy local jar file to HDFS (try to get the original jar filename)
    String fname = getLocalJarFileNameFromEnvConst();
    if (fname == null) {
        //get location of unpacked jar classes and repackage (if required)
        String lclassFile = DMLYarnClient.class.getProtectionDomain().getCodeSource().getLocation().getPath()
                .toString();
        File flclassFile = new File(lclassFile);
        if (!flclassFile.isDirectory()) //called w/ jar 
            fname = lclassFile;
        else //called w/ unpacked jar (need to be repackaged)   
            fname = createJar(lclassFile);
    }
    Path srcPath = new Path(fname);
    Path dstPath = new Path(hdfsWD, srcPath.getName());
    FileUtil.copy(FileSystem.getLocal(yconf), srcPath, fs, dstPath, false, true, yconf);
    _hdfsJarFile = dstPath.makeQualified(fs).toString();
    LOG.debug(
            "Jar file copied from local file: " + srcPath.toString() + " to HDFS file: " + dstPath.toString());
}

From source file: org.dutir.lucene.io.HadoopUtility.java

License: Mozilla Public License

protected static void saveApplicationSetupToJob(JobConf jobConf, boolean getFreshProperties) throws Exception {
    // Do we load a fresh properties File?
    //TODO fix, if necessary
    //if (getFreshProperties)
    //   loadApplicationSetup(new Path(ApplicationSetup.TERRIER_HOME));

    FileSystem remoteFS = FileSystem.get(jobConf);
    URI remoteFSURI = remoteFS.getUri();
    //make a copy of the current application setup properties, these may be amended
    //as some files are more globally accessible
    final Properties propertiesDuringJob = new Properties();
    Properties appProperties = ApplicationSetup.getProperties();
    for (Object _key : appProperties.keySet()) {
        String key = (String) _key;
        propertiesDuringJob.put(key, appProperties.get(key));
    }

    //the share folder is needed during indexing, save this on DFS
    if (Files.getFileSystemName(ApplicationSetup.LUCENE_SHARE).equals("local")) {
        Path tempTRShare = makeTemporaryFile(jobConf, "terrier.share");
        propertiesDuringJob.setProperty("terrier.share", remoteFSURI.resolve(tempTRShare.toUri()).toString());
        logger.info("Copying terrier share/ directory to shared storage area ("
                + remoteFSURI.resolve(tempTRShare.toUri()).toString() + ")");
        FileUtil.copy(FileSystem.getLocal(jobConf), new Path(ApplicationSetup.LUCENE_SHARE), remoteFS,
                tempTRShare, false, false, jobConf);
    }

    //copy the terrier.properties content over
    Path tempTRProperties = makeTemporaryFile(jobConf, "terrier.properties");
    logger.debug("Writing terrier properties out to DFS " + tempTRProperties.toString());
    OutputStream out = remoteFS.create(tempTRProperties);
    remoteFS.deleteOnExit(tempTRProperties);
    propertiesDuringJob.store(out, "Automatically generated by HadoopPlugin.saveApplicationSetupToJob()");
    out.close();
    out = null;
    DistributedCache.addCacheFile(tempTRProperties.toUri().resolve(new URI("#terrier.properties")), jobConf);
    DistributedCache.createSymlink(jobConf);

    //copy the non-JVM system properties over as well
    Path tempSysProperties = makeTemporaryFile(jobConf, "system.properties");
    DataOutputStream dos = FileSystem.get(jobConf).create(tempSysProperties);
    logger.debug("Writing system properties out to DFS " + tempSysProperties.toString());
    for (Object _propertyKey : System.getProperties().keySet()) {
        String propertyKey = (String) _propertyKey;
        if (!startsWithAny(propertyKey, checkSystemProperties)) {
            dos.writeUTF(propertyKey);
            dos.writeUTF(System.getProperty(propertyKey));
        }
    }
    dos.writeUTF("FIN");
    dos.close();
    dos = null;
    DistributedCache.addCacheFile(tempSysProperties.toUri().resolve(new URI("#system.properties")), jobConf);
}

From source file: org.mrgeo.cmd.updatesplitfile.UpdateSplitFile.java

License: Apache License

@Override
public int run(final String[] args, final Configuration conf, final Properties providerProperties) {
    try {
        final CommandLineParser parser = new PosixParser();
        final CommandLine line = parser.parse(new Options(), args);

        final String splitFile = line.getArgs()[0];
        final FileSystem fs = HadoopFileUtils.getFileSystem(conf);

        final Path splitFilePath = new Path(splitFile);

        final Path tmp = new Path(HadoopFileUtils.getTempDir(), HadoopUtils.createRandomString(4));

        FileUtil.copy(fs, splitFilePath, fs, tmp, false, true, conf);

        final SplitFile sf = new SplitFile(conf);
        sf.copySplitFile(tmp.toString(), splitFilePath.getParent().toString(), true);

        return 0;
    } catch (final Exception e) {
        e.printStackTrace();
    }

    return -1;
}

From source file: org.mrgeo.hdfs.tile.SplitFile.java

License: Apache License

public void copySplitFile(final String splitFileFrom, final String splitFileToDir, final int[] partitionsUsed,
        final boolean deleteSource) throws IOException {
    // move split file into the output directory
    if (!HadoopUtils.isLocal(conf)) {
        final Path splitFileTo = new Path(splitFileToDir, SPLIT_FILE);
        final FileSystem fsTo = splitFileTo.getFileSystem(conf);
        Path splitFileFromPath = new Path(splitFileFrom);
        final FileSystem fsFrom = splitFileFromPath.getFileSystem(conf);
        if (fsFrom.exists(splitFileFromPath)) {
            final FileStatus status = fsFrom.getFileStatus(splitFileFromPath);

            // if the splits file is empty, no need to copy it...
            if (status.getLen() > 0) {
                // if we have partition names already, just copy the file...
                if (hasPartitionNames(splitFileFrom)) {
                    FileUtil.copy(fsFrom, splitFileFromPath, fsTo, splitFileTo, deleteSource, true, conf);
                } else {
                    // no partitions in the split file, make one...
                    fsTo.delete(splitFileTo, false);

                    String[] partitions = findPartitions(splitFileToDir);
                    List<Long> splits = readSplits(splitFileFrom);

                    if ((splits.size() + 1) > partitions.length) {

                        if (partitionsUsed != null) {
                            final List<Long> tmpSplits = new ArrayList<Long>();

                            // make sure the array is sorted...
                            Arrays.sort(partitionsUsed);
                            for (final int used : partitionsUsed) {
                                if (used < splits.size()) {
                                    tmpSplits.add(splits.get(used));
                                }
                            }

                            splits = tmpSplits;
                        }
                    } else if ((splits.size() + 1) < partitions.length) {
                        if (log.isDebugEnabled()) {
                            log.debug("original splits:");
                            for (Long split : splits) {
                                log.debug("  " + split);
                            }

                            log.debug("partitions found:");
                            for (String part : partitions) {
                                log.debug("  " + part);
                                FileStatus st = fsTo.getFileStatus(new Path(splitFileToDir, part + "/index"));
                                log.debug("  index size: " + st.getLen());
                                FileStatus st2 = fsTo.getFileStatus(new Path(splitFileToDir, part + "/data"));
                                log.debug("  data size: " + st2.getLen());
                            }

                            List<String> tmpPartitions = new ArrayList<String>();

                            for (String part : partitions) {

                                MapFile.Reader reader = null;

                                try {
                                    reader = new MapFile.Reader(fsTo,
                                            (new Path(splitFileToDir, part)).toString(), conf);

                                    TileIdWritable key = new TileIdWritable();
                                    RasterWritable val = new RasterWritable();
                                    if (reader.next(key, val)) {
                                        tmpPartitions.add(part);
                                    }
                                } finally {
                                    if (reader != null) {
                                        reader.close();
                                    }
                                }
                            }

                            log.debug("partitions having records:");
                            for (String p : tmpPartitions) {
                                log.debug("  " + p);
                            }
                        }
                    }

                    if (splits.size() + 1 != partitions.length) {
                        throw new IOException(
                                "splits file and file partitions mismatch (splits should be 1 less than partitions)!  Splits length: "
                                        + splits.size() + " number of partitions: " + partitions.length);
                    }

                    writeSplits(splits, partitions, splitFileTo.toString());

                    if (deleteSource) {
                        fsFrom.delete(splitFileFromPath, false);
                    }
                }
            } else {
                fsFrom.delete(splitFileFromPath, false);
            }
        }
    }
}

From source file: org.mrgeo.mapalgebra.VectorReaderMapOp.java

License: Apache License

@Override
public void moveOutput(String toName) throws IOException {
    if (dp != null) {
        // Do the move through the data provider
        dp.move(toName);
    } else {
        // If there is no vector data provider, fall-back to the existing HDFS
        // code.
        Path toPath = new Path(toName);
        Path sourcePath = new Path(_outputName);
        Configuration conf = createConfiguration();
        FileSystem sourceFs = HadoopFileUtils.getFileSystem(conf, sourcePath);
        FileSystem destFs = HadoopFileUtils.getFileSystem(conf, toPath);
        if (!FileUtil.copy(sourceFs, sourcePath, destFs, toPath, false, false, conf)) {
            throw new IOException("Error copying '" + _outputName + "' to '" + toName.toString() + "'");
        }
        // Now copy the columns file, if it exists
        Path sourceColumns = new Path(_outputName + ".columns");
        if (sourceFs.exists(sourceColumns)) {
            Path toColumns = new Path(toName.toString() + ".columns");
            if (FileUtil.copy(sourceFs, sourceColumns, destFs, toColumns, false, false, conf) == false) {
                throw new IOException("Error copying columns file '" + sourceColumns.toString() + "' to '"
                        + toColumns.toString());
            }
        }
    }
    _outputName = toName;
    _output = new BasicInputFormatDescriptor(_outputName);
}