Usage examples for org.apache.hadoop.fs.FileSystem.startLocalOutput
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) throws IOException
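startLocalOutput() returns a path on local disk that the caller can write to with ordinary local file I/O. For a remote FileSystem such as HDFS this is the tmpLocalFile argument; once writing is finished, the matching completeLocalOutput(fsOutputFile, tmpLocalFile) call copies the local file to its final destination. For a FileSystem that is already local, startLocalOutput() hands back the output path itself and completeLocalOutput() is a no-op. A minimal, self-contained sketch of this round trip, before the real-world examples below; the class name, paths, and payload here are hypothetical, not taken from any of the projects listed:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class StartLocalOutputSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // The target FileSystem, e.g. HDFS if fs.defaultFS points at a cluster.
    FileSystem fs = FileSystem.get(conf);

    // Hypothetical paths: the eventual destination in fs, and a local scratch file.
    Path fsOutputFile = new Path("/data/output/part-00000");
    Path tmpLocalFile = new Path("/tmp/part-00000");

    // startLocalOutput() returns a local path to write to (tmpLocalFile for a
    // remote FileSystem, the destination itself for a local one).
    Path local = fs.startLocalOutput(fsOutputFile, tmpLocalFile);

    // Write the output using plain local I/O.
    FileSystem localFs = FileSystem.getLocal(conf);
    FSDataOutputStream out = localFs.create(local);
    try {
      out.writeBytes("example payload\n");
    } finally {
      out.close();
    }

    // completeLocalOutput() moves the finished local file to fsOutputFile
    // (a no-op when fs is already the local FileSystem).
    fs.completeLocalOutput(fsOutputFile, tmpLocalFile);
  }
}

Every example below follows this same write-locally-then-promote pattern, usually to build a file format (SQLite partition files, Lucene indexes) that cannot be written through a streaming HDFS OutputStream.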
From source file: com.splout.db.hadoop.engine.SploutSQLProxyOutputFormat.java
License: Apache License
@Override
public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
    throws IOException, InterruptedException {

  long waitTimeHeartBeater = context.getConfiguration().getLong(HeartBeater.WAIT_TIME_CONF, 5000);
  heartBeater = new HeartBeater(context, waitTimeHeartBeater);
  heartBeater.needHeartBeat();
  conf = context.getConfiguration();
  this.context = context;

  outputFormat.setConf(context.getConfiguration());

  return new RecordWriter<ITuple, NullWritable>() {

    // Temporary and permanent Paths for properly writing Hadoop output files
    private Map<Integer, Path> permPool = new HashMap<Integer, Path>();
    private Map<Integer, Path> tempPool = new HashMap<Integer, Path>();

    private void initSql(int partition) throws IOException, InterruptedException {
      // HDFS final location of the generated partition file. It will be
      // loaded into a temporary folder in HDFS and then finally committed
      // by the OutputCommitter to the proper location.
      FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(
          SploutSQLProxyOutputFormat.this.context);
      Path perm = new Path(committer.getWorkPath(), partition + ".db");
      FileSystem fs = perm.getFileSystem(conf);
      // Make a task-unique name that contains the actual index output name to
      // make debugging simpler.
      // Note: if using JVM reuse, the sequence number will not be reset for a
      // new task using the JVM.
      Path temp = conf.getLocalPath("mapred.local.dir",
          "splout_task_" + SploutSQLProxyOutputFormat.this.context.getTaskAttemptID() + '.'
              + FILE_SEQUENCE.incrementAndGet());

      FileSystem localFileSystem = FileSystem.getLocal(conf);
      if (localFileSystem.exists(temp)) {
        localFileSystem.delete(temp, true);
      }
      localFileSystem.mkdirs(temp);

      Path local = fs.startLocalOutput(perm, new Path(temp, partition + ".db"));

      permPool.put(partition, perm);
      tempPool.put(partition, new Path(temp, partition + ".db"));

      outputFormat.initPartition(partition, local);
    }

    @Override
    public void close(TaskAttemptContext ctx) throws IOException, InterruptedException {
      FileSystem fs = FileSystem.get(ctx.getConfiguration());
      try {
        if (ctx != null) {
          heartBeater.setProgress(ctx);
        }
        outputFormat.close();
        for (Map.Entry<Integer, Path> entry : permPool.entrySet()) {
          // Hadoop - completeLocalOutput()
          fs.completeLocalOutput(entry.getValue(), tempPool.get(entry.getKey()));
        }
      } finally {
        // in any case, destroy the HeartBeater
        heartBeater.cancelHeartBeat();
      }
    }

    @Override
    public void write(ITuple tuple, NullWritable ignore) throws IOException, InterruptedException {
      int partition = (Integer) tuple.get(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD);
      if (tempPool.get(partition) == null) {
        initSql(partition);
      }
      outputFormat.write(tuple);
    }
  };
}
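Here each partition's SQLite file is written through the local path returned by startLocalOutput(), and the perm/temp path pairs are pooled so that close() can call completeLocalOutput() once per partition to promote the finished files into the committer's work path on HDFS.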
From source file: org.apache.nutch.indexer.IndexMerger.java
License: Apache License
/**
 * Merge all input indexes into a single output index
 */
public void merge(Path[] indexes, Path outputIndex, Path localWorkingDir) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("merging indexes to: " + outputIndex);
  }

  FileSystem localFs = FileSystem.getLocal(getConf());
  if (localWorkingDir == null) {
    localWorkingDir = new Path("indexmerger-" + System.currentTimeMillis());
  }
  if (localFs.exists(localWorkingDir)) {
    localFs.delete(localWorkingDir);
  }
  localFs.mkdirs(localWorkingDir);

  // Get local output target
  FileSystem fs = FileSystem.get(getConf());
  Path tmpLocalOutput = new Path(localWorkingDir, "merge-output");
  Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput);

  Directory[] dirs = new Directory[indexes.length];
  for (int i = 0; i < indexes.length; i++) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Adding " + indexes[i]);
    }
    dirs[i] = new FsDirectory(fs, indexes[i], false, this.conf);
  }

  //
  // Merge indices
  //
  IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
  writer.setMergeFactor(conf.getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
  writer.setMaxBufferedDocs(conf.getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
  writer.setMaxMergeDocs(conf.getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
  writer.setTermIndexInterval(
      conf.getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
  writer.setInfoStream(LogUtil.getDebugStream(LOG));
  writer.setUseCompoundFile(false);
  writer.setSimilarity(new NutchSimilarity());
  writer.addIndexes(dirs);
  writer.close();

  //
  // Put target back
  //
  fs.completeLocalOutput(outputIndex, tmpLocalOutput);
  FileSystem.getLocal(conf).delete(localWorkingDir);
  if (LOG.isInfoEnabled()) {
    LOG.info("done merging");
  }
}
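The merged Lucene index is built entirely on local disk at the path returned by startLocalOutput(), and only copied up to outputIndex by completeLocalOutput() after the IndexWriter has been closed; the local working directory is then deleted.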
From source file: org.archive.jbs.lucene.LuceneOutputFormat.java
License: Apache License
public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
    final Progressable progress) throws IOException {
  // Open Lucene index in ${temp}
  this.fs = FileSystem.get(job);
  this.job = job;
  this.perm = new Path(FileOutputFormat.getOutputPath(job), name);
  this.temp = job.getLocalPath("index/_" + (new Random().nextInt()));

  this.fs.delete(perm, true); // delete old, if any

  indexer = new IndexWriter(new NIOFSDirectory(new File(fs.startLocalOutput(perm, temp).toString())),
      new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);

  indexer.setMergeFactor(job.getInt("jbs.lucene.mergeFactor", 100));
  indexer.setMaxMergeDocs(job.getInt("jbs.lucene.maxMergeDocs", Integer.MAX_VALUE));
  indexer.setRAMBufferSizeMB(job.getInt("jbs.lucene.maxRAMBufferSize", 512));
  indexer.setTermIndexInterval(
      job.getInt("jbs.lucene.termIndexInterval", IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL));
  indexer.setMaxFieldLength(job.getInt("jbs.lucene.max.tokens", Integer.MAX_VALUE));
  indexer.setUseCompoundFile(false);
  indexer.setSimilarity(new WebSimilarity());

  LuceneDocumentWriter docWriter = buildDocumentWriter(job, indexer);

  return new LuceneRecordWriter(docWriter);
}
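The IndexWriter is opened directly over the local directory returned by startLocalOutput(perm, temp); the matching completeLocalOutput(perm, temp) call belongs in the writer's close path, which is not shown in this excerpt.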
From source file: org.archive.nutchwax.IndexMerger.java
License: Apache License
/**
 * Merge all input indexes into a single output index
 */
public void merge(IndexReader[] readers, Path outputIndex, Path localWorkingDir, boolean parallel)
    throws IOException {
  LOG.info("merging indexes to: " + outputIndex);

  FileSystem localFs = FileSystem.getLocal(getConf());
  if (localFs.exists(localWorkingDir)) {
    localFs.delete(localWorkingDir, true);
  }
  localFs.mkdirs(localWorkingDir);

  // Get local output target
  FileSystem fs = FileSystem.get(getConf());
  if (fs.exists(outputIndex)) {
    throw new FileAlreadyExistsException("Output directory " + outputIndex + " already exists!");
  }

  Path tmpLocalOutput = new Path(localWorkingDir, "merge-output");
  Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput);

  //
  // Merge indices
  //
  IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
  writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
  writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
  writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
  writer.setTermIndexInterval(
      getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
  writer.setInfoStream(LogUtil.getDebugStream(LOG));
  writer.setUseCompoundFile(false);
  writer.setSimilarity(new NutchSimilarity());
  writer.addIndexes(readers);
  writer.close();

  //
  // Put target back
  //
  fs.completeLocalOutput(outputIndex, tmpLocalOutput);
  LOG.info("done merging");
}
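As in the Nutch version above, the index is merged locally and promoted to outputIndex with completeLocalOutput() after the writer is closed; this variant merges from IndexReaders rather than directories and additionally refuses to overwrite an existing output directory.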
From source file: org.hbasene.index.create.mapred.IndexOutputFormat.java
License: Apache License
@Override
public RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs,
    JobConf job, String name, final Progressable progress) throws IOException {

  final Path perm = new Path(FileOutputFormat.getOutputPath(job), name);
  final Path temp = job.getLocalPath("index/_" + Integer.toString(random.nextInt()));

  LOG.info("To index into " + perm);

  // delete old, if any
  fs.delete(perm, true);

  final IndexConfiguration indexConf = new IndexConfiguration();
  String content = job.get("hbase.index.conf");
  if (content != null) {
    indexConf.addFromXML(content);
  }

  String analyzerName = indexConf.getAnalyzerName();
  Analyzer analyzer;
  try {
    Class<? extends Analyzer> analyzerClass = Class.forName(analyzerName).asSubclass(Analyzer.class);
    Constructor<? extends Analyzer> analyzerCtor = analyzerClass.getConstructor(Version.class);
    analyzer = analyzerCtor.newInstance(Version.LUCENE_30);
  } catch (Exception e) {
    throw new IOException("Error in creating an analyzer object " + analyzerName);
  }

  // build locally first
  final IndexWriter writer = new IndexWriter(
      FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), analyzer, true,
      IndexWriter.MaxFieldLength.LIMITED);

  // no delete, so no need for maxBufferedDeleteTerms
  writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
  writer.setMaxFieldLength(indexConf.getMaxFieldLength());
  writer.setMaxMergeDocs(indexConf.getMaxMergeDocs());
  writer.setMergeFactor(indexConf.getMergeFactor());
  String similarityName = indexConf.getSimilarityName();
  if (similarityName != null) {
    try {
      Class<? extends Similarity> similarityClass = Class.forName(similarityName)
          .asSubclass(Similarity.class);
      Constructor<? extends Similarity> ctor = similarityClass.getConstructor(Version.class);
      Similarity similarity = ctor.newInstance(Version.LUCENE_30);
      writer.setSimilarity(similarity);
    } catch (Exception e) {
      throw new IOException("Error in creating a similarity object " + similarityName);
    }
  }
  writer.setUseCompoundFile(indexConf.isUseCompoundFile());

  return new RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper>() {
    AtomicBoolean closed = new AtomicBoolean(false);
    private long docCount = 0;

    public void write(ImmutableBytesWritable key, LuceneDocumentWrapper value) throws IOException {
      // unwrap and index doc
      Document doc = value.get();
      writer.addDocument(doc);
      docCount++;
      progress.progress();
    }

    public void close(final Reporter reporter) throws IOException {
      // spawn a thread to give progress heartbeats
      Thread prog = new Thread() {
        @Override
        public void run() {
          while (!closed.get()) {
            try {
              reporter.setStatus("closing");
              Thread.sleep(1000);
            } catch (InterruptedException e) {
              continue;
            } catch (Throwable e) {
              return;
            }
          }
        }
      };

      try {
        prog.start();

        // optimize index
        if (indexConf.doOptimize()) {
          if (LOG.isInfoEnabled()) {
            LOG.info("Optimizing index.");
          }
          writer.optimize();
        }

        // close index
        writer.close();
        if (LOG.isInfoEnabled()) {
          LOG.info("Done indexing " + docCount + " docs.");
        }

        // copy to perm destination in dfs
        fs.completeLocalOutput(perm, temp);
        if (LOG.isInfoEnabled()) {
          LOG.info("Copy done.");
        }
      } finally {
        closed.set(true);
      }
    }
  };
}
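The returned RecordWriter indexes documents into the local Lucene directory opened over startLocalOutput(perm, temp) and, in close(), optionally optimizes the index, closes the writer, and finally calls completeLocalOutput(perm, temp) to copy the finished index to its permanent location in the DFS, keeping a heartbeat thread alive so the task is not killed during the potentially long copy.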