Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd; import static com.hp.hpl.jena.sparql.util.Utils.nowAsString; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.Map; import java.util.TreeMap; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.jena.tdbloader4.Constants; import org.apache.jena.tdbloader4.NodeTableRewriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.hp.hpl.jena.tdb.base.file.Location; import com.hp.hpl.jena.tdb.store.DatasetGraphTDB; import com.hp.hpl.jena.tdb.store.bulkloader.BulkLoader; import com.hp.hpl.jena.tdb.store.bulkloader2.CmdIndexBuild; import com.hp.hpl.jena.tdb.store.bulkloader2.ProgressLogger; import com.hp.hpl.jena.tdb.sys.Names; import com.hp.hpl.jena.tdb.sys.SetupTDB; public class download extends Configured implements Tool { private static final Logger log = LoggerFactory.getLogger(download.class); public download() { super(); log.debug("constructed with no configuration."); } public download(Configuration configuration) { super(configuration); log.debug("constructed with configuration."); } @Override public int run(String[] args) throws Exception { if (args.length != 3) { System.err.printf("Usage: %s [generic options] <input node table> <input b+tree indexes> <output>\n", getClass().getName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } Configuration configuration = getConf(); Location location = new Location(args[2]); DatasetGraphTDB dsgDisk = SetupTDB.buildDataset(location); dsgDisk.sync(); dsgDisk.close(); FileSystem fs = FileSystem.get(configuration); // Node table new File(args[1], "nodes.dat").delete(); mergeToLocalFile(fs, new Path(args[0]), args[2], configuration); // TODO: this is a sort of a cheat and it could go away (if it turns out to be too slow)! fixNodeTable2(location); // B+Tree indexes mergeToLocalFile2(fs, new Path(args[1]), args[2], configuration); return 0; } private void mergeToLocalFile(FileSystem fs, Path src, String outPath, Configuration configuration) throws FileNotFoundException, IOException { FileStatus[] status = fs.listStatus(src); Map<String, Path> paths = new TreeMap<String, Path>(); for (FileStatus fileStatus : status) { Path path = fileStatus.getPath(); String pathName = path.getName(); if (pathName.startsWith(Constants.NAME_SECOND)) { paths.put(pathName, path); } } File outFile = new File(outPath, Names.indexId2Node + ".dat"); OutputStream out = new FileOutputStream(outFile); for (String pathName : paths.keySet()) { Path path = new Path(src, paths.get(pathName)); log.debug("Concatenating {} into {}...", path.toUri(), outFile.getAbsoluteFile()); InputStream in = fs.open(new Path(path, Names.indexId2Node + ".dat")); IOUtils.copyBytes(in, out, configuration, false); in.close(); } out.close(); } private void mergeToLocalFile2(FileSystem fs, Path src, String outPath, Configuration configuration) throws FileNotFoundException, IOException { // Find all the right paths and copy .gz files locally FileStatus[] status = fs.listStatus(src); Map<String, Path> paths = new TreeMap<String, Path>(); for (FileStatus fileStatus : status) { Path path = fileStatus.getPath(); String pathName = path.getName(); if (pathName.startsWith(Constants.NAME_FOURTH)) { paths.put(pathName, path); } } for (String pathName : paths.keySet()) { Path path = new Path(src, paths.get(pathName)); status = fs.listStatus(path); for (FileStatus fileStatus : status) { Path p = fileStatus.getPath(); log.debug("Copying {} to {}...", p.toUri(), outPath); fs.copyToLocalFile(p, new Path(outPath, p.getName())); } } // Merge .gz files into indexName.gz File fileOutputPath = new File(outPath); File[] files = fileOutputPath.listFiles(new FileFilter() { @Override public boolean accept(File pathname) { return pathname.getName().endsWith(".gz"); } }); Arrays.sort(files); String prevIndexName = null; OutputStream out = null; for (File file : files) { log.debug("Processing {}... ", file.getName()); String indexName = file.getName().substring(0, file.getName().indexOf("_")); if (prevIndexName == null) prevIndexName = indexName; if (out == null) out = new GZIPOutputStream(new FileOutputStream(new File(outPath, indexName + ".gz"))); if (!prevIndexName.equals(indexName)) { if (out != null) out.close(); log.debug("Index name set to {}", indexName); out = new GZIPOutputStream(new FileOutputStream(new File(outPath, indexName + ".gz"))); } InputStream in = new GZIPInputStream(new FileInputStream(file)); log.debug("Copying {} into {}.gz ...", file.getName(), indexName); IOUtils.copyBytes(in, out, 8192, false); in.close(); file.delete(); prevIndexName = indexName; } if (out != null) out.close(); // build B+Tree indexes Location location = new Location(outPath); for (String idxName : Constants.indexNames) { log.debug("Creating {} index...", idxName); String indexFilename = location.absolute(idxName, "gz"); if (new File(indexFilename).exists()) { new File(outPath, idxName + ".dat").delete(); new File(outPath, idxName + ".idn").delete(); CmdIndexBuild.main(location.getDirectoryPath(), idxName, indexFilename); // To save some disk space new File(indexFilename).delete(); } } } public static void fixNodeTable2(Location location) throws IOException { ProgressLogger monitor = new ProgressLogger(log, "Data (1/2)", BulkLoader.DataTickPoint, BulkLoader.superTick); log.info("Data (1/2)..."); monitor.start(); NodeTableRewriter.fixNodeTable2(location, log, monitor); long time = monitor.finish(); long total = monitor.getTicks(); float elapsedSecs = time / 1000F; float rate = (elapsedSecs != 0) ? total / elapsedSecs : 0; String str = String.format("Total: %,d RDF nodes : %,.2f seconds : %,.2f nodes/sec [%s]", total, elapsedSecs, rate, nowAsString()); log.info(str); } public static void main(String[] args) throws Exception { int exitCode = ToolRunner.run(new download(), args); System.exit(exitCode); } }