List of usage examples for org.apache.hadoop.fs FileUtil copy
public static boolean copy(FileSystem srcFS, FileStatus srcStatus, FileSystem dstFS, Path dst, boolean deleteSource, boolean overwrite, Configuration conf) throws IOException
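A minimal, self-contained sketch of calling this overload, assuming a default Configuration and hypothetical /tmp paths; resolving a FileStatus for the source first gives FileUtil the metadata to choose between a single-file copy and a recursive directory copy:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.FileUtil;
    import org.apache.hadoop.fs.Path;

    public class FileUtilCopySketch {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            Path src = new Path("/tmp/source");      // hypothetical source
            Path dst = new Path("/tmp/destination"); // hypothetical destination

            FileSystem srcFS = src.getFileSystem(conf);
            FileSystem dstFS = dst.getFileSystem(conf);

            // Resolve the source's metadata, then copy without deleting the
            // source (deleteSource=false), replacing any existing destination
            // (overwrite=true).
            FileStatus srcStatus = srcFS.getFileStatus(src);
            boolean copied = FileUtil.copy(srcFS, srcStatus, dstFS, dst, false, true, conf);
            System.out.println("copy returned " + copied);
        }
    }

The Path-based and Path[]-based overloads used in the examples below follow the same deleteSource/overwrite convention.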
From source file:org.apache.kylin.dict.global.GlobalDictHDFSStore.java
License:Apache License
@Override
void prepareForWrite(String workingDir) throws IOException {
    // TODO create lock file
    Path working = new Path(workingDir);

    if (fileSystem.exists(working)) {
        fileSystem.delete(working, true);
        logger.info("Working directory {} exists, delete it first", working);
    }

    // When building the dict, copy all data into the working dir and work on it,
    // so a sudden server crash cannot corrupt the published data.
    Long[] versions = listAllVersions();
    if (versions.length > 0) {
        Path latestVersion = getVersionDir(versions[versions.length - 1]);
        FileUtil.copy(fileSystem, latestVersion, fileSystem, working, false, true, conf);
    } else {
        fileSystem.mkdirs(working);
    }
}
From source file:org.apache.kylin.dict.global.GlobalDictHDFSStore.java
License:Apache License
@Override
public String copyToAnotherMeta(KylinConfig srcConfig, KylinConfig dstConfig) throws IOException {
    checkArgument(baseDir.startsWith(srcConfig.getHdfsWorkingDirectory()),
        "Please check why current directory {} doesn't belong to source working directory {}",
        baseDir, srcConfig.getHdfsWorkingDirectory());

    final String dstBaseDir = baseDir.replaceFirst(srcConfig.getHdfsWorkingDirectory(),
        dstConfig.getHdfsWorkingDirectory());

    Long[] versions = listAllVersions();
    if (versions.length == 0) { // empty dict, nothing to copy
        return dstBaseDir;
    }

    Path srcVersionDir = getVersionDir(versions[versions.length - 1]);
    Path dstVersionDir = new Path(srcVersionDir.toString().replaceFirst(
        srcConfig.getHdfsWorkingDirectory(), dstConfig.getHdfsWorkingDirectory()));
    FileSystem dstFS = dstVersionDir.getFileSystem(conf);
    if (dstFS.exists(dstVersionDir)) {
        dstFS.delete(dstVersionDir, true);
    }
    FileUtil.copy(fileSystem, srcVersionDir, dstFS, dstVersionDir, false, true, conf);

    return dstBaseDir;
}
From source file:org.apache.mahout.cf.taste.example.email.MailToPrefsDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
    addOption("separator", "sep",
        "The separator used in the input file to separate to, from, subject. Default is \\n", "\n");
    addOption("from", "f",
        "The position in the input text (value) where the from email is located, starting from zero (0).", "0");
    addOption("refs", "r",
        "The position in the input text (value) where the reference ids are located, starting from zero (0).", "1");
    addOption(buildOption("useCounts", "u",
        "If set, then use the number of times the user has interacted with a thread as an indication of their preference. Otherwise, use boolean preferences.",
        false, false, String.valueOf(true)));
    Map<String, List<String>> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    int chunkSize = Integer.parseInt(getOption("chunkSize"));
    String separator = getOption("separator");
    Configuration conf = getConf();
    boolean useCounts = hasOption("useCounts");
    AtomicInteger currentPhase = new AtomicInteger();
    int[] msgDim = new int[1];
    //TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
    List<Path> msgIdChunks = null;
    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);

    //create the dictionary between message ids and longs
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        //TODO: there seems to be a pattern emerging for dictionary creation
        //-- sparse vectors from seq files also has this.
        Path msgIdsPath = new Path(output, "msgIds");
        if (overwrite) {
            HadoopUtil.delete(conf, msgIdsPath);
        }
        log.info("Creating Msg Id Dictionary");
        Job createMsgIdDictionary = prepareJob(input, msgIdsPath, SequenceFileInputFormat.class,
            MsgIdToDictionaryMapper.class, Text.class, VarIntWritable.class, MailToDictionaryReducer.class,
            Text.class, VarIntWritable.class, SequenceFileOutputFormat.class);
        boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
        //write out the dictionary at the top level
        msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
            createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
    }

    //create the dictionary between from email addresses and longs
    List<Path> fromChunks = null;
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Path fromIdsPath = new Path(output, "fromIds");
        if (overwrite) {
            HadoopUtil.delete(conf, fromIdsPath);
        }
        log.info("Creating From Id Dictionary");
        Job createFromIdDictionary = prepareJob(input, fromIdsPath, SequenceFileInputFormat.class,
            FromEmailToDictionaryMapper.class, Text.class, VarIntWritable.class, MailToDictionaryReducer.class,
            Text.class, VarIntWritable.class, SequenceFileOutputFormat.class);
        createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
        boolean succeeded = createFromIdDictionary.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
        //write out the dictionary at the top level
        int[] fromDim = new int[1];
        fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
            createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
    }

    //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
    if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
        //Job map
        //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
        //we don't need the double loop
        log.info("Creating recommendation matrix");
        Path vecPath = new Path(output, "recInput");
        if (overwrite) {
            HadoopUtil.delete(conf, vecPath);
        }
        //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
        conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
        conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
        conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
        conf.set(EmailUtility.FROM_INDEX, getOption("from"));
        conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
        conf.set(EmailUtility.SEPARATOR, separator);
        conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
        int j = 0;
        int i = 0;
        for (Path fromChunk : fromChunks) {
            for (Path idChunk : msgIdChunks) {
                Path out = new Path(vecPath, "tmp-" + i + '-' + j);
                DistributedCache.setCacheFiles(new URI[] { fromChunk.toUri(), idChunk.toUri() }, conf);
                Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
                    MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class,
                    Text.class, NullWritable.class, TextOutputFormat.class);
                createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
                boolean succeeded = createRecMatrix.waitForCompletion(true);
                if (!succeeded) {
                    return -1;
                }
                //copy the results up a level
                //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath,
                //    true, conf, "");
                FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB,
                    PathFilters.partFilter(), null, conf);
                for (int k = 0; k < fs.length; k++) {
                    FileStatus f = fs[k];
                    Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
                    FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf),
                        outPath, true, overwrite, conf);
                }
                HadoopUtil.delete(conf, out);
                j++;
            }
            i++;
        }
        //concat the files together
        /*Path mergePath = new Path(output, "vectors.dat");
        if (overwrite) {
            HadoopUtil.delete(conf, mergePath);
        }
        log.info("Merging together output vectors to vectors.dat in {}", output);*/
        //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
        //    false, conf, "\n");
    }
    return 0;
}
From source file:org.apache.slider.common.tools.SliderUtils.java
License:Apache License
/**
 * Copy a directory to a new FS - both paths must be qualified. If
 * a directory needs to be created, supplied permissions can override
 * the default values. Existing directories are not touched.
 * @param conf configuration
 * @param srcDirPath src dir
 * @param destDirPath dest dir
 * @param permission permission for the dest directory; null means "default"
 * @return # of files copied
 */
public static int copyDirectory(Configuration conf, Path srcDirPath, Path destDirPath,
        FsPermission permission) throws IOException, BadClusterStateException {
    FileSystem srcFS = FileSystem.get(srcDirPath.toUri(), conf);
    FileSystem destFS = FileSystem.get(destDirPath.toUri(), conf);

    //list all paths in the src.
    if (!srcFS.exists(srcDirPath)) {
        throw new FileNotFoundException("Source dir not found " + srcDirPath);
    }
    if (!srcFS.isDirectory(srcDirPath)) {
        throw new FileNotFoundException("Source dir not a directory " + srcDirPath);
    }
    GlobFilter dotFilter = new GlobFilter("[!.]*");
    FileStatus[] entries = srcFS.listStatus(srcDirPath, dotFilter);
    int srcFileCount = entries.length;
    if (srcFileCount == 0) {
        return 0;
    }
    if (permission == null) {
        permission = FsPermission.getDirDefault();
    }
    if (!destFS.exists(destDirPath)) {
        new SliderFileSystem(destFS, conf).createWithPermissions(destDirPath, permission);
    }
    Path[] sourcePaths = new Path[srcFileCount];
    for (int i = 0; i < srcFileCount; i++) {
        FileStatus e = entries[i];
        Path srcFile = e.getPath();
        if (srcFS.isDirectory(srcFile)) {
            String msg = "Configuration dir " + srcDirPath + " contains a directory " + srcFile;
            log.warn(msg);
            throw new IOException(msg);
        }
        log.debug("copying src conf file {}", srcFile);
        sourcePaths[i] = srcFile;
    }
    log.debug("Copying {} files from {} to dest {}", srcFileCount, srcDirPath, destDirPath);
    FileUtil.copy(srcFS, sourcePaths, destFS, destDirPath, false, true, conf);
    return srcFileCount;
}
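The Slider example above uses the multi-source overload, copy(FileSystem, Path[], FileSystem, Path, boolean, boolean, Configuration), which copies a batch of files into a single destination directory in one call. A minimal sketch with hypothetical paths on the default filesystem:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.FileUtil;
    import org.apache.hadoop.fs.Path;

    public class MultiSourceCopySketch {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);

            // Hypothetical source files and destination directory.
            Path[] sources = { new Path("/tmp/a.conf"), new Path("/tmp/b.conf") };
            Path destDir = new Path("/tmp/dest");

            // Keep the originals (deleteSource=false) and replace any
            // same-named files already under destDir (overwrite=true).
            FileUtil.copy(fs, sources, fs, destDir, false, true, conf);
        }
    }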
From source file:org.apache.sysml.runtime.util.MapReduceTool.java
License:Apache License
public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
    Path originalPath = new Path(originalDir);
    Path newPath = new Path(newDir);
    boolean deleteSource = false;
    boolean overwrite = true;

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(originalPath, job);
    if (fs.exists(originalPath)) {
        FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
    }
}
From source file:org.apache.sysml.yarn.DMLYarnClient.java
License:Apache License
@SuppressWarnings("deprecation") private void copyResourcesToHdfsWorkingDir(YarnConfiguration yconf, String hdfsWD) throws ParseException, IOException, DMLRuntimeException, InterruptedException { Path confPath = new Path(hdfsWD, DML_CONFIG_NAME); FileSystem fs = IOUtilFunctions.getFileSystem(confPath, yconf); //create working directory MapReduceTool.createDirIfNotExistOnHDFS(confPath, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION); //serialize the dml config to HDFS file //NOTE: we do not modify and ship the absolute scratch space path of the current user //because this might result in permission issues if the app master is run with a different user //(runtime plan migration during resource reoptimizations now needs to use qualified names //for shipping/reading intermediates) TODO modify resource reoptimizer on prototype integration. try (FSDataOutputStream fout = fs.create(confPath, true)) { fout.writeBytes(_dmlConfig.serializeDMLConfig() + "\n"); }//from w w w .ja v a2 s . co m _hdfsDMLConfig = confPath.makeQualified(fs).toString(); LOG.debug("DML config written to HDFS file: " + _hdfsDMLConfig + ""); //serialize the dml script to HDFS file Path scriptPath = new Path(hdfsWD, DML_SCRIPT_NAME); try (FSDataOutputStream fout2 = fs.create(scriptPath, true)) { fout2.writeBytes(_dmlScript); } _hdfsDMLScript = scriptPath.makeQualified(fs).toString(); LOG.debug("DML script written to HDFS file: " + _hdfsDMLScript + ""); // copy local jar file to HDFS (try to get the original jar filename) String fname = getLocalJarFileNameFromEnvConst(); if (fname == null) { //get location of unpacked jar classes and repackage (if required) String lclassFile = DMLYarnClient.class.getProtectionDomain().getCodeSource().getLocation().getPath() .toString(); File flclassFile = new File(lclassFile); if (!flclassFile.isDirectory()) //called w/ jar fname = lclassFile; else //called w/ unpacked jar (need to be repackaged) fname = createJar(lclassFile); } Path srcPath = new Path(fname); Path dstPath = new Path(hdfsWD, srcPath.getName()); FileUtil.copy(FileSystem.getLocal(yconf), srcPath, fs, dstPath, false, true, yconf); _hdfsJarFile = dstPath.makeQualified(fs).toString(); LOG.debug( "Jar file copied from local file: " + srcPath.toString() + " to HDFS file: " + dstPath.toString()); }
From source file:org.dutir.lucene.io.HadoopUtility.java
License:Mozilla Public License
protected static void saveApplicationSetupToJob(JobConf jobConf, boolean getFreshProperties) throws Exception {
    // Do we load a fresh properties file?
    //TODO fix, if necessary
    //if (getFreshProperties)
    //    loadApplicationSetup(new Path(ApplicationSetup.TERRIER_HOME));
    FileSystem remoteFS = FileSystem.get(jobConf);
    URI remoteFSURI = remoteFS.getUri();

    //make a copy of the current application setup properties; these may be amended
    //as some files are made more globally accessible
    final Properties propertiesDuringJob = new Properties();
    Properties appProperties = ApplicationSetup.getProperties();
    for (Object _key : appProperties.keySet()) {
        String key = (String) _key;
        propertiesDuringJob.put(key, appProperties.get(key));
    }

    //the share folder is needed during indexing; save this on DFS
    if (Files.getFileSystemName(ApplicationSetup.LUCENE_SHARE).equals("local")) {
        Path tempTRShare = makeTemporaryFile(jobConf, "terrier.share");
        propertiesDuringJob.setProperty("terrier.share", remoteFSURI.resolve(tempTRShare.toUri()).toString());
        logger.info("Copying terrier share/ directory to shared storage area ("
            + remoteFSURI.resolve(tempTRShare.toUri()).toString() + ")");
        FileUtil.copy(FileSystem.getLocal(jobConf), new Path(ApplicationSetup.LUCENE_SHARE),
            remoteFS, tempTRShare, false, false, jobConf);
    }

    //copy the terrier.properties content over
    Path tempTRProperties = makeTemporaryFile(jobConf, "terrier.properties");
    logger.debug("Writing terrier properties out to DFS " + tempTRProperties.toString());
    OutputStream out = remoteFS.create(tempTRProperties);
    remoteFS.deleteOnExit(tempTRProperties);
    propertiesDuringJob.store(out, "Automatically generated by HadoopPlugin.saveApplicationSetupToJob()");
    out.close();
    out = null;
    DistributedCache.addCacheFile(tempTRProperties.toUri().resolve(new URI("#terrier.properties")), jobConf);
    DistributedCache.createSymlink(jobConf);

    //copy the non-JVM system properties over as well
    Path tempSysProperties = makeTemporaryFile(jobConf, "system.properties");
    DataOutputStream dos = FileSystem.get(jobConf).create(tempSysProperties);
    logger.debug("Writing system properties out to DFS " + tempSysProperties.toString());
    for (Object _propertyKey : System.getProperties().keySet()) {
        String propertyKey = (String) _propertyKey;
        if (!startsWithAny(propertyKey, checkSystemProperties)) {
            dos.writeUTF(propertyKey);
            dos.writeUTF(System.getProperty(propertyKey));
        }
    }
    dos.writeUTF("FIN");
    dos.close();
    dos = null;
    DistributedCache.addCacheFile(tempSysProperties.toUri().resolve(new URI("#system.properties")), jobConf);
}
From source file:org.mrgeo.cmd.updatesplitfile.UpdateSplitFile.java
License:Apache License
@Override
public int run(final String[] args, final Configuration conf, final Properties providerProperties) {
    try {
        final CommandLineParser parser = new PosixParser();
        final CommandLine line = parser.parse(new Options(), args);

        final String splitFile = line.getArgs()[0];

        final FileSystem fs = HadoopFileUtils.getFileSystem(conf);
        final Path splitFilePath = new Path(splitFile);
        final Path tmp = new Path(HadoopFileUtils.getTempDir(), HadoopUtils.createRandomString(4));

        FileUtil.copy(fs, splitFilePath, fs, tmp, false, true, conf);

        final SplitFile sf = new SplitFile(conf);
        sf.copySplitFile(tmp.toString(), splitFilePath.getParent().toString(), true);

        return 0;
    } catch (final Exception e) {
        e.printStackTrace();
    }
    return -1;
}
From source file:org.mrgeo.hdfs.tile.SplitFile.java
License:Apache License
public void copySplitFile(final String splitFileFrom, final String splitFileToDir,
        final int[] partitionsUsed, final boolean deleteSource) throws IOException {
    // move split file into the output directory
    if (!HadoopUtils.isLocal(conf)) {
        final Path splitFileTo = new Path(splitFileToDir, SPLIT_FILE);
        final FileSystem fsTo = splitFileTo.getFileSystem(conf);

        Path splitFileFromPath = new Path(splitFileFrom);
        final FileSystem fsFrom = splitFileFromPath.getFileSystem(conf);

        if (fsFrom.exists(splitFileFromPath)) {
            final FileStatus status = fsFrom.getFileStatus(splitFileFromPath);

            // if the splits file is empty, no need to copy it...
            if (status.getLen() > 0) {
                // if we have partition names already, just copy the file...
                if (hasPartitionNames(splitFileFrom)) {
                    FileUtil.copy(fsFrom, splitFileFromPath, fsTo, splitFileTo, deleteSource, true, conf);
                } else {
                    // no partitions in the split file, make one...
                    fsTo.delete(splitFileTo, false);

                    String[] partitions = findPartitions(splitFileToDir);
                    List<Long> splits = readSplits(splitFileFrom);

                    if ((splits.size() + 1) > partitions.length) {
                        if (partitionsUsed != null) {
                            final List<Long> tmpSplits = new ArrayList<Long>();

                            // make sure the array is sorted...
                            Arrays.sort(partitionsUsed);
                            for (final int used : partitionsUsed) {
                                if (used < splits.size()) {
                                    tmpSplits.add(splits.get(used));
                                }
                            }
                            splits = tmpSplits;
                        }
                    } else if ((splits.size() + 1) < partitions.length) {
                        if (log.isDebugEnabled()) {
                            log.debug("original splits:");
                            for (Long split : splits) {
                                log.debug("  " + split);
                            }
                            log.debug("partitions found:");
                            for (String part : partitions) {
                                log.debug("  " + part);
                                FileStatus st = fsTo.getFileStatus(new Path(splitFileToDir, part + "/index"));
                                log.debug("    index size: " + st.getLen());
                                FileStatus st2 = fsTo.getFileStatus(new Path(splitFileToDir, part + "/data"));
                                log.debug("    data size: " + st2.getLen());
                            }

                            List<String> tmpPartitions = new ArrayList<String>();
                            for (String part : partitions) {
                                MapFile.Reader reader = null;
                                try {
                                    reader = new MapFile.Reader(fsTo, (new Path(splitFileToDir, part)).toString(), conf);
                                    TileIdWritable key = new TileIdWritable();
                                    RasterWritable val = new RasterWritable();
                                    if (reader.next(key, val)) {
                                        tmpPartitions.add(part);
                                    }
                                } finally {
                                    if (reader != null) {
                                        reader.close();
                                    }
                                }
                            }
                            log.debug("partitions having records:");
                            for (String p : tmpPartitions) {
                                log.debug("  " + p);
                            }
                        }
                    }

                    if (splits.size() + 1 != partitions.length) {
                        throw new IOException(
                            "splits file and file partitions mismatch (splits should be 1 less than partitions)! Splits length: "
                                + splits.size() + " number of partitions: " + partitions.length);
                    }

                    writeSplits(splits, partitions, splitFileTo.toString());

                    if (deleteSource) {
                        fsFrom.delete(splitFileFromPath, false);
                    }
                }
            } else {
                fsFrom.delete(splitFileFromPath, false);
            }
        }
    }
}
From source file:org.mrgeo.mapalgebra.VectorReaderMapOp.java
License:Apache License
@Override
public void moveOutput(String toName) throws IOException {
    if (dp != null) {
        // Do the move through the data provider
        dp.move(toName);
    } else {
        // If there is no vector data provider, fall back to the existing HDFS code.
        Path toPath = new Path(toName);
        Path sourcePath = new Path(_outputName);
        Configuration conf = createConfiguration();
        FileSystem sourceFs = HadoopFileUtils.getFileSystem(conf, sourcePath);
        FileSystem destFs = HadoopFileUtils.getFileSystem(conf, toPath);
        if (!FileUtil.copy(sourceFs, sourcePath, destFs, toPath, false, false, conf)) {
            throw new IOException("Error copying '" + _outputName + "' to '" + toName + "'");
        }
        // Now copy the accompanying .columns file, if there is one.
        Path sourceColumns = new Path(_outputName + ".columns");
        if (sourceFs.exists(sourceColumns)) {
            Path toColumns = new Path(toName + ".columns");
            if (!FileUtil.copy(sourceFs, sourceColumns, destFs, toColumns, false, false, conf)) {
                throw new IOException("Error copying columns file '" + sourceColumns
                    + "' to '" + toColumns + "'");
            }
        }
    }
    _outputName = toName;
    _output = new BasicInputFormatDescriptor(_outputName);
}