List of usage examples for org.apache.hadoop.fs.FileSystem.rename
public abstract boolean rename(Path src, Path dst) throws IOException;
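Before the project-specific examples, here is a minimal, hypothetical sketch of the basic pattern they all rely on: call rename and check the boolean result. The paths and class name are illustrative only and do not come from any of the projects listed below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Illustrative paths; rename() moves a file or directory within the same FileSystem.
        Path src = new Path("/tmp/staging/part-00000");
        Path dst = new Path("/tmp/final/part-00000");
        // rename() typically returns false rather than throwing when it fails
        // (e.g. missing source or existing destination), so check the result.
        if (!fs.rename(src, dst)) {
            throw new IOException("Could not rename " + src + " to " + dst);
        }
    }
}

Because a rename can fail transiently, several of the examples below (notably the NiFi processors) wrap this call in a retry loop instead of checking it only once.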
From source file: org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("rank", "k", "decomposition rank", true);
    addOption("oversampling", "p", "oversampling", String.valueOf(15));
    addOption("blockHeight", "r", "Y block height (must be > (k+p))", String.valueOf(10000));
    addOption("outerProdBlockHeight", "oh",
        "block height of outer products during multiplication, increase for sparse inputs",
        String.valueOf(30000));
    addOption("abtBlockHeight", "abth",
        "block height of Y_i in ABtJob during AB' multiplication, increase for extremely sparse inputs",
        String.valueOf(200000));
    addOption("minSplitSize", "s", "minimum split size", String.valueOf(-1));
    addOption("computeU", "U", "compute U (true/false)", String.valueOf(true));
    addOption("uHalfSigma", "uhs", "Compute U * Sigma^0.5", String.valueOf(false));
    addOption("uSigma", "us", "Compute U * Sigma", String.valueOf(false));
    addOption("computeV", "V", "compute V (true/false)", String.valueOf(true));
    addOption("vHalfSigma", "vhs", "compute V * Sigma^0.5", String.valueOf(false));
    addOption("reduceTasks", "t", "number of reduce tasks (where applicable)", true);
    addOption("powerIter", "q", "number of additional power iterations (0..2 is good)", String.valueOf(0));
    addOption("broadcast", "br", "whether use distributed cache to broadcast matrices wherever possible",
        String.valueOf(true));
    addOption("pca", "pca", "run in pca mode: compute column-wise mean and subtract from input",
        String.valueOf(false));
    addOption("pcaOffset", "xi", "path(glob) of external pca mean (optional, dont compute, use external mean");
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> pargs = parseArguments(args);
    if (pargs == null) {
        return -1;
    }

    int k = Integer.parseInt(getOption("rank"));
    int p = Integer.parseInt(getOption("oversampling"));
    int r = Integer.parseInt(getOption("blockHeight"));
    int h = Integer.parseInt(getOption("outerProdBlockHeight"));
    int abh = Integer.parseInt(getOption("abtBlockHeight"));
    int q = Integer.parseInt(getOption("powerIter"));
    int minSplitSize = Integer.parseInt(getOption("minSplitSize"));
    boolean computeU = Boolean.parseBoolean(getOption("computeU"));
    boolean computeV = Boolean.parseBoolean(getOption("computeV"));
    boolean cUHalfSigma = Boolean.parseBoolean(getOption("uHalfSigma"));
    boolean cUSigma = Boolean.parseBoolean(getOption("uSigma"));
    boolean cVHalfSigma = Boolean.parseBoolean(getOption("vHalfSigma"));
    int reduceTasks = Integer.parseInt(getOption("reduceTasks"));
    boolean broadcast = Boolean.parseBoolean(getOption("broadcast"));
    String xiPathStr = getOption("pcaOffset");
    Path xiPath = xiPathStr == null ? null : new Path(xiPathStr);
    boolean pca = Boolean.parseBoolean(getOption("pca")) || xiPath != null;
    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }

    Path[] inputPaths = { getInputPath() };
    Path tempPath = getTempPath();
    FileSystem fs = FileSystem.get(getTempPath().toUri(), conf);

    // housekeeping
    if (overwrite) {
        // clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
        // clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
    }

    fs.mkdirs(getOutputPath());

    // MAHOUT-817
    if (pca && xiPath == null) {
        xiPath = new Path(tempPath, "xi");
        if (overwrite) {
            fs.delete(xiPath, true);
        }
        MatrixColumnMeansJob.run(conf, inputPaths[0], xiPath);
    }

    SSVDSolver solver = new SSVDSolver(conf, inputPaths, new Path(tempPath, "ssvd"), r, k, p, reduceTasks);
    solver.setMinSplitSize(minSplitSize);
    solver.setComputeU(computeU);
    solver.setComputeV(computeV);
    solver.setcUHalfSigma(cUHalfSigma);
    solver.setcVHalfSigma(cVHalfSigma);
    solver.setcUSigma(cUSigma);
    solver.setOuterBlockHeight(h);
    solver.setAbtBlockHeight(abh);
    solver.setQ(q);
    solver.setBroadcast(broadcast);
    solver.setOverwrite(overwrite);
    if (xiPath != null) {
        solver.setPcaMeanPath(new Path(xiPath, "part-*"));
    }

    solver.run();

    Vector svalues = solver.getSingularValues().viewPart(0, k);
    SSVDHelper.saveVector(svalues, getOutputPath("sigma"), conf);

    if (computeU && !fs.rename(new Path(solver.getUPath()), getOutputPath())) {
        throw new IOException("Unable to move U results to the output path.");
    }
    if (cUHalfSigma && !fs.rename(new Path(solver.getuHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma^0.5 results to the output path.");
    }
    if (cUSigma && !fs.rename(new Path(solver.getuSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move U*Sigma results to the output path.");
    }
    if (computeV && !fs.rename(new Path(solver.getVPath()), getOutputPath())) {
        throw new IOException("Unable to move V results to the output path.");
    }
    if (cVHalfSigma && !fs.rename(new Path(solver.getvHalfSigmaPath()), getOutputPath())) {
        throw new IOException("Unable to move V*Sigma^0.5 results to the output path.");
    }

    // Delete the temp path on exit
    fs.deleteOnExit(getTempPath());

    return 0;
}
From source file: org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License: Apache License
/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node thereby splitting the process across
 * multiple map/reduces.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minValue of log likelihood ratio to used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. Its recommended you calculated this based on the number of cores and the free memory
 *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
 *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
 *          partial vectors without thrashing the system due to increased swapping
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf, int minSupport,
        int maxNGramSize, float minLLRValue, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, minSupport);
        dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, output, chunkSizeInMegabytes,
            new LongWritable(), maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue,
            numReducers);
        dictionaryChunks = createDictionaryChunks(minSupport,
            new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
            chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
    }

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath, maxTermDimension[0],
            sequentialAccess, numReducers);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
            sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License: Apache License
/**
 * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
 * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
 * per node thereby splitting the process across multiple map/reduces.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. Its recommended you calculated this based on the number of cores and the free memory
 *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
 *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
 *          partial vectors without thrashing the system due to increased swapping
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param numReducers
 *          The number of reducers to spawn. This also affects the possible parallelism since each reducer
 *          will typically produce a single output file containing tf-idf vectors for a subset of the
 *          documents in the corpus.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void processTfIdf(Path input, Path output, int chunkSizeInMegabytes, int minDf, int maxDFPercent,
        float normPower, boolean sequentialAccessOutput, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }

    if (normPower != PartialVectorMerger.NO_NORMALIZING && normPower < 0) {
        throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }

    if (minDf < 1) {
        minDf = 1;
    }
    if (maxDFPercent < 0 || maxDFPercent > 100) {
        maxDFPercent = 99;
    }

    Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);

    startDFCounting(input, wordCountPath);
    Pair<Long[], List<Path>> datasetFeatures = createDictionaryChunks(wordCountPath, output,
        chunkSizeInMegabytes);

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    List<Path> dictionaryChunks = datasetFeatures.getSecond();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures.getFirst()[1], minDf,
            maxDFPercent, dictionaryChunk, partialVectorOutputPath, sequentialAccessOutput);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
            datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
From source file: org.apache.metron.writer.hdfs.SourceAwareMoveAction.java
License: Apache License
@Override
public void execute(FileSystem fileSystem, Path filePath) throws IOException {
    Path destPath = new Path(new Path(destination, getSource(filePath)), filePath.getName());
    LOG.info("Moving file " + filePath + " to " + destPath);
    // Note: the boolean result of rename() is captured but not acted upon in this example.
    boolean success = fileSystem.rename(filePath, destPath);
    return;
}
From source file: org.apache.nifi.processors.hadoop.AbstractPutHDFSRecord.java
License: Apache License
/**
 * Attempts to rename srcFile to destFile up to 10 times, with a 200ms sleep in between each attempt.
 *
 * If the file has not been renamed after 10 attempts, a FailureException is thrown.
 *
 * @param fileSystem the file system where the files are located
 * @param srcFile the source file
 * @param destFile the destination file to rename the source to
 * @throws IOException if IOException happens while attempting to rename
 * @throws InterruptedException if renaming is interrupted
 * @throws FailureException if the file couldn't be renamed after 10 attempts
 */
protected void rename(final FileSystem fileSystem, final Path srcFile, final Path destFile)
        throws IOException, InterruptedException, FailureException {
    boolean renamed = false;
    for (int i = 0; i < 10; i++) { // try to rename multiple times.
        if (fileSystem.rename(srcFile, destFile)) {
            renamed = true;
            break; // rename was successful
        }
        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
    }
    if (!renamed) {
        fileSystem.delete(srcFile, false);
        throw new FailureException("Could not rename file " + srcFile + " to its final filename");
    }
}
From source file: org.apache.nifi.processors.hadoop.MoveHDFS.java
License: Apache License
protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session, FlowFile parentFlowFile) {
    Preconditions.checkState(parentFlowFile != null, "No parent flowfile for this batch was provided");

    // process the batch of files
    final Configuration conf = getConfiguration();
    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (conf == null || ugi == null) {
        getLogger().error("Configuration or UserGroupInformation not configured properly");
        session.transfer(parentFlowFile, REL_FAILURE);
        context.yield();
        return;
    }

    for (final Path file : files) {
        ugi.doAs(new PrivilegedAction<Object>() {
            @Override
            public Object run() {
                FlowFile flowFile = session.create(parentFlowFile);
                try {
                    final String originalFilename = file.getName();
                    final Path configuredRootOutputDirPath = processorConfig.getOutputDirectory();
                    final Path newFile = new Path(configuredRootOutputDirPath, originalFilename);
                    final boolean destinationExists = hdfs.exists(newFile);

                    // If destination file already exists, resolve that
                    // based on processor configuration
                    if (destinationExists) {
                        switch (processorConfig.getConflictResolution()) {
                        case REPLACE_RESOLUTION:
                            if (hdfs.delete(file, false)) {
                                getLogger().info("deleted {} in order to replace with the contents of {}",
                                    new Object[] { file, flowFile });
                            }
                            break;
                        case IGNORE_RESOLUTION:
                            session.transfer(flowFile, REL_SUCCESS);
                            getLogger().info(
                                "transferring {} to success because file with same name already exists",
                                new Object[] { flowFile });
                            return null;
                        case FAIL_RESOLUTION:
                            session.transfer(session.penalize(flowFile), REL_FAILURE);
                            getLogger().warn(
                                "penalizing {} and routing to failure because file with same name already exists",
                                new Object[] { flowFile });
                            return null;
                        default:
                            break;
                        }
                    }

                    // Create destination directory if it does not exist
                    try {
                        if (!hdfs.getFileStatus(configuredRootOutputDirPath).isDirectory()) {
                            throw new IOException(configuredRootOutputDirPath.toString()
                                + " already exists and is not a directory");
                        }
                    } catch (FileNotFoundException fe) {
                        if (!hdfs.mkdirs(configuredRootOutputDirPath)) {
                            throw new IOException(configuredRootOutputDirPath.toString()
                                + " could not be created");
                        }
                        changeOwner(context, hdfs, configuredRootOutputDirPath);
                    }

                    boolean moved = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (processorConfig.getOperation().equals("move")) {
                            if (hdfs.rename(file, newFile)) {
                                moved = true;
                                break; // rename was successful
                            }
                        } else {
                            if (FileUtil.copy(hdfs, file, hdfs, newFile, false, conf)) {
                                moved = true;
                                break; // copy was successful
                            }
                        }
                        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!moved) {
                        throw new ProcessException("Could not move file " + file + " to its final filename");
                    }

                    changeOwner(context, hdfs, newFile);
                    final String outputPath = newFile.toString();
                    final String newFilename = newFile.getName();
                    final String hdfsPath = newFile.getParent().toString();
                    flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename);
                    flowFile = session.putAttribute(flowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                    final String transitUri = (outputPath.startsWith("/")) ? "hdfs:/" + outputPath
                        : "hdfs://" + outputPath;
                    session.getProvenanceReporter().send(flowFile, transitUri);

                    session.transfer(flowFile, REL_SUCCESS);
                } catch (final Throwable t) {
                    getLogger().error("Failed to rename on HDFS due to {}", new Object[] { t });
                    session.transfer(session.penalize(flowFile), REL_FAILURE);
                    context.yield();
                }
                return null;
            }
        });
    }
}
From source file: org.apache.nifi.processors.hadoop.PutHDFS.java
License: Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY)
                    .evaluateAttributeExpressions(putFlowFile).getValue();
                final Path configuredRootDirPath = new Path(dirValue);

                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();

                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue()
                    : hdfs.getDefaultBlockSize(configuredRootDirPath);

                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
                    : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);

                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue()
                    : hdfs.getDefaultReplication(configuredRootDirPath);

                final CompressionCodec codec = getCompressionCodec(context, configuration);

                final String filename = codec != null
                    ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension()
                    : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());

                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);

                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(configuredRootDirPath.toString()
                            + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }

                final boolean destinationExists = hdfs.exists(copyFile);

                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch (conflictResponse) {
                    case REPLACE_RESOLUTION:
                        if (hdfs.delete(copyFile, false)) {
                            getLogger().info("deleted {} in order to replace with the contents of {}",
                                new Object[] { copyFile, putFlowFile });
                        }
                        break;
                    case IGNORE_RESOLUTION:
                        session.transfer(putFlowFile, REL_SUCCESS);
                        getLogger().info(
                            "transferring {} to success because file with same name already exists",
                            new Object[] { putFlowFile });
                        return null;
                    case FAIL_RESOLUTION:
                        session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                        getLogger().warn(
                            "penalizing {} and routing to failure because file with same name already exists",
                            new Object[] { putFlowFile });
                        return null;
                    default:
                        break;
                    }
                }

                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {
                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }
                });
                stopWatch.stop();

                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;

                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue())
                        || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            break; // rename was successful
                        }
                        Thread.sleep(200L); // try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file "
                            + tempCopyFile + " to its final filename");
                    }
                    changeOwner(context, hdfs, copyFile, flowFile);
                }

                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}",
                    new Object[] { putFlowFile, copyFile, millis, dataRate });

                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());

                session.transfer(putFlowFile, REL_SUCCESS);
            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}",
                            new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }
            return null;
        }
    });
}
From source file: org.apache.nutch.admin.index.IndexThread.java
License: Apache License
public void run() {
    try {
        // this.fMessage = "index.running";
        FileSystem fs = FileSystem.get(this.conf);
        if (fs.exists(this.indexDir)) {
            Path backup_path = new Path(this.conf.get("searcher.dir"), "index_backup_" + new Date().getTime());
            System.out.println("Backing up " + this.indexDir + " as " + backup_path + "...");
            fs.rename(this.indexDir, backup_path);
            System.out.println("Done.");
        }
        System.out.println("Re-indexing in " + this.indexDir.toString());
        this.indexer.index(this.indexDir, this.tableName);
    } catch (IOException e) {
        LOG.warning(e.toString());
        this.fMessage = e.toString();
    }
}
From source file: org.apache.nutch.crawl.CrawlDbMerger.java
License: Apache License
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlDb merge: starting at " + sdf.format(start));

    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Adding " + dbs[i]);
        }
        FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
    }
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME));
    long end = System.currentTimeMillis();
    LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.crawl.LinkDb.java
License: Apache License
public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old))
            fs.delete(old, true);
        fs.rename(current, old);
    }
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    if (fs.exists(old))
        fs.delete(old, true);
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}