/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.hadoop.jobs;

import com.google.common.io.ByteStreams;
import com.google.common.io.LimitInputStream;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.server.FileBackedInputStream;
import org.archive.streamcontext.SimpleStream;
import org.archive.util.DateUtils;
import org.archive.util.FileNameSpec;
import org.archive.util.HMACSigner;
import org.archive.util.IAUtils;

/**
 * ArchiveFileExtractor - repackage records from (W)ARC files reachable over
 * HTTP or HDFS into new ARC/WARC files in HDFS.
 */
public class ArchiveFileExtractor extends Configured implements Tool {

    public final static String TOOL_NAME = "ArchiveFileExtractor";
    public final static String TOOL_DESCRIPTION =
            "Repackage records from ARC/WARC files into new ARC/WARC files in HDFS";

    public static final Log LOG = LogFactory.getLog(ArchiveFileExtractor.class);

    public static class ArchiveFileExtractorMapper extends MapReduceBase
            implements Mapper<Object, Text, Text, Text> {

        private JobConf jobConf;

        private static final Charset UTF8 = Charset.forName("UTF-8");
        private static int CR = 13;
        private static int LF = 10;

        private String timestamp14;
        private String timestampZ;
        private byte[] warcHeaderContents;

        /** Template for the leading ARC "filedesc" record. */
        private final static String ARC_PATTERN =
                "filedesc://%s 0.0.0.0 %s text/plain 76\n"
                + "1 0 InternetArchive\n"
                + "URL IP-address Archive-date Content-type Archive-length\n\n";

        /** Template for the leading WARC "warcinfo" record header. */
        private final static String WARC_PATTERN =
                "WARC/1.0\r\n"
                + "WARC-Type: warcinfo\r\n"
                + "WARC-Date: %s\r\n"
                + "WARC-Filename: %s\r\n"
                + "WARC-Record-ID: <%s>\r\n"
                + "Content-Type: application/warc-fields\r\n"
                + "Content-Length: %d\r\n\r\n";

        /** Returns a full record id of the form urn:uuid:&lt;random UUID&gt;. */
        private String getWARCRecordID() {
            return "urn:uuid:" + UUID.randomUUID().toString();
        }

        private byte[] getARCHeader(String name) {
            return String.format(ARC_PATTERN, name, timestamp14).getBytes(UTF8);
        }

        private byte[] getWARCHeader(String name) throws IOException {
            String t = String.format(WARC_PATTERN, timestampZ, name,
                    getWARCRecordID(), warcHeaderContents.length + 4);
            byte[] b = t.getBytes(UTF8);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            baos.write(b);
            baos.write(warcHeaderContents);
            baos.write(CR);
            baos.write(LF);
            baos.write(CR);
            baos.write(LF);
            return baos.toByteArray();
        }

        public byte[] getWarcHeaderContents() {
            return warcHeaderContents;
        }

        public void setWarcHeaderContents(byte[] warcHeaderContents) {
            this.warcHeaderContents = warcHeaderContents;
        }

        /**
         * Read one GZIP member from the stream and return its compressed length.
         */
        private long getGZLength(InputStream is) throws IOException, GZIPFormatException {
            SimpleStream s = new SimpleStream(is);
            GZIPMemberSeries gzs = new GZIPMemberSeries(s, "range", 0, true);
            GZIPSeriesMember m = gzs.getNextMember();
            m.skipMember();
            return m.getCompressedBytesRead();
        }
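
        /*
         * For orientation, a sketch of the synthetic file headers built by the
         * helpers above. The filename and timestamp values are hypothetical;
         * only the field layout comes from ARC_PATTERN / WARC_PATTERN.
         *
         *   getARCHeader("EXAMPLE-000000.arc.gz") with timestamp14 = "20120315123456":
         *
         *     filedesc://EXAMPLE-000000.arc.gz 0.0.0.0 20120315123456 text/plain 76
         *     1 0 InternetArchive
         *     URL IP-address Archive-date Content-type Archive-length
         *
         *   getWARCHeader("EXAMPLE-000000.warc.gz") produces roughly:
         *
         *     WARC/1.0
         *     WARC-Type: warcinfo
         *     WARC-Date: <timestampZ>
         *     WARC-Filename: EXAMPLE-000000.warc.gz
         *     WARC-Record-ID: <urn:uuid:...>
         *     Content-Type: application/warc-fields
         *     Content-Length: <warcHeaderContents.length + 4>
         *
         *     <the configured warc-fields block>
         *
         * Each header is written as its own GZIP member at the start of the
         * corresponding output file, before any extracted records are appended.
         */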

        /**
         * <p>Configures the job.</p>
         *
         * @param job The job configuration.
         */
        public void configure(JobConf job) {
            this.jobConf = job;
        }

        /**
         * Copy the W/ARC records listed in <code>value</code> into new
         * ARC/WARC files under the configured output directory. The value
         * holds a file prefix and a bag of (offset,FilePath) tuples.
         */
        public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String inputString = value.toString();
            String[] inputParts = inputString.split("\t");
            if (inputParts.length != 2) {
                throw new IOException("invalid input");
            }

            FileNameSpec warcNamer;
            FileNameSpec arcNamer;

            String hmacName = this.jobConf.get("hmacName", "");
            String hmacSignature = this.jobConf.get("hmacSignature", "");
            HMACSigner signer = null;
            if (hmacName != null && hmacSignature != null && !hmacName.isEmpty() && !hmacSignature.isEmpty()) {
                signer = new HMACSigner(hmacSignature, hmacName);
            }

            timestamp14 = this.jobConf.get("timestamp14",
                    DateUtils.get14DigitDate(System.currentTimeMillis()));
            String warcHeaderString = this.jobConf.get("warcHeaderString");
            warcHeaderContents = warcHeaderString.getBytes(UTF8);
            String outputDir = this.jobConf.get("outputDir");

            FileSystem hdfsSys = null;
            FSDataOutputStream currentArcOS = null;
            FSDataOutputStream currentWarcOS = null;

            try {
                long msse = DateUtils.parse14DigitDate(timestamp14).getTime();
                timestampZ = DateUtils.getLog17Date(msse);
            } catch (ParseException e) {
                LOG.error("Error parsing timestamp: ", e);
                throw new IOException(e);
            }

            String prefix = inputParts[0];
            prefix += "-";
            String resourceLocationBagString = inputParts[1];

            arcNamer = new FileNameSpec(prefix, ".arc.gz");
            warcNamer = new FileNameSpec(prefix, ".warc.gz");
            boolean firstArcRecord = true;
            boolean firstWarcRecord = true;

            // Turn "{(off1,url1),(off2,url2)}" into tab-separated "off,url" entries:
            // remove braces
            resourceLocationBagString = resourceLocationBagString.replaceAll("[{}]", "");
            // split into tuples
            resourceLocationBagString = resourceLocationBagString.replace("),(", ")\t(");
            // remove parentheses
            resourceLocationBagString = resourceLocationBagString.replaceAll("[()]", "");

            // inputs
            String[] resourceLocations = resourceLocationBagString.split("\t");

            FileBackedInputStream fbis = null;
            InputStream is = null;
            long millis = System.currentTimeMillis();
            String destArcOutputFileString = null;
            String destWarcOutputFileString = null;

            for (int i = 0; i < resourceLocations.length; i++) {
                String[] offLoc = resourceLocations[i].split(",");
                long offset = Long.parseLong(offLoc[0]);
                String url = offLoc[1];
                boolean isArc = false;
                try {
                    if (url.endsWith(".arc.gz")) {
                        isArc = true;
                    } else if (url.endsWith(".warc.gz")) {
                        // not an ARC; isArc stays false
                    } else {
                        throw new IOException("URL (" + url + ") must end with '.arc.gz' or '.warc.gz'");
                    }

                    if (url.startsWith("http://")) {
                        // Fetch the record over HTTP with a Range request starting at the offset.
                        URL u = new URL(url);
                        URLConnection conn = u.openConnection();
                        conn.setRequestProperty("Range", String.format("bytes=%d-", offset));
                        if (signer != null) {
                            conn.setRequestProperty("Cookie", signer.getHMacCookieStr(1000));
                        }
                        conn.connect();
                        is = conn.getInputStream();
                    } else if (url.startsWith("hdfs://")) {
                        URI u = new URI(url);
                        // only initialize the FS once
                        if (hdfsSys == null) {
                            URI defaultURI = new URI(u.getScheme() + "://" + u.getHost() + ":" + u.getPort() + "/");
                            hdfsSys = FileSystem.get(defaultURI, new Configuration());
                        }
                        Path path = new Path(u.getPath());
                        FSDataInputStream fis = hdfsSys.open(path);
                        fis.seek(offset);
                        is = fis;
                    }

                    // Buffer the GZIP member, measure its compressed length, then re-read it.
                    fbis = new FileBackedInputStream(is);
                    long length = getGZLength(fbis);
                    InputStream orig = fbis.getInputStream();

                    if (isArc) {
                        if (firstArcRecord) {
                            // First ARC record: open a .TMP output file and write the filedesc header.
                            String newArcName = arcNamer.getNextName();
                            destArcOutputFileString = outputDir + "/" + newArcName;
                            String outputFileString = destArcOutputFileString + "." + millis + ".TMP";
                            currentArcOS = FileSystem.get(new java.net.URI(outputFileString), this.jobConf)
                                    .create(new Path(outputFileString), false);
                            byte[] header = getARCHeader(newArcName);
                            GZIPMemberWriter w = new GZIPMemberWriter(currentArcOS);
                            w.write(new ByteArrayInputStream(header));
                            firstArcRecord = false;
                        }
                        LimitInputStream lis = new LimitInputStream(orig, length);
                        ByteStreams.copy(lis, currentArcOS);
                    } else {
                        if (firstWarcRecord) {
                            // First WARC record: open a .TMP output file and write the warcinfo header.
                            String newWarcName = warcNamer.getNextName();
                            destWarcOutputFileString = outputDir + "/" + newWarcName;
                            String outputFileString = destWarcOutputFileString + "." + millis + ".TMP";
                            currentWarcOS = FileSystem.get(new java.net.URI(outputFileString), this.jobConf)
                                    .create(new Path(outputFileString), false);
                            byte[] header = getWARCHeader(newWarcName);
                            GZIPMemberWriter w = new GZIPMemberWriter(currentWarcOS);
                            w.write(new ByteArrayInputStream(header));
                            firstWarcRecord = false;
                        }
                        LimitInputStream lis = new LimitInputStream(orig, length);
                        ByteStreams.copy(lis, currentWarcOS);
                    }
                    output.collect(new Text("SUCCESS"), new Text(offset + "\t" + url));
                } catch (Exception e) {
                    LOG.error("Error processing: ", e);
                    output.collect(new Text("FAIL"), new Text(offset + "\t" + url));
                    if (!this.jobConf.getBoolean("soft", false)) {
                        throw new IOException(e.toString() + " offset:" + offset + " url:" + url);
                    }
                } finally {
                    if (is != null) {
                        is.close();
                    }
                    if (fbis != null) {
                        fbis.resetBacker();
                    }
                }
            } // end of for loop

            // Close the outputs and rename the .TMP files to their final names.
            try {
                if (currentArcOS != null) {
                    currentArcOS.close();
                    FileSystem.get(new java.net.URI(destArcOutputFileString + "." + millis + ".TMP"), this.jobConf)
                            .rename(new Path(destArcOutputFileString + "." + millis + ".TMP"),
                                    new Path(destArcOutputFileString));
                }
                if (currentWarcOS != null) {
                    currentWarcOS.close();
                    FileSystem.get(new java.net.URI(destWarcOutputFileString + "." + millis + ".TMP"), this.jobConf)
                            .rename(new Path(destWarcOutputFileString + "." + millis + ".TMP"),
                                    new Path(destWarcOutputFileString));
                }
            } catch (Exception e) {
                LOG.error("Error processing: ", e);
                throw new IOException(e.toString() + " Error finalizing files");
            }
        }
    }

    /**
     * Print usage.
     */
    public void printUsage() {
        String usage = "Usage: ArchiveFileExtractor [OPTIONS] <taskfile> <outputdir>\n";
        usage += "\tOptions:\n";
        usage += "\t\t-mappers NUM - try to run with approximately NUM map tasks (default: 10)\n";
        usage += "\t\t-timestamp14 TS - The 14 digit timestamp to use\n";
        usage += "\t\t-hmacname HMACNAME - The HMAC Name string\n";
        usage += "\t\t-hmacsignature HMACSIG - The HMAC Signature string\n";
        usage += "\t\t-warc-header-local-file LOCALPATH_TO_WARCHEADERFILE - The local file containing the WARC header to use\n";
        usage += "\t\t-soft - tolerate task exceptions\n";
        usage += "\t\t-timeout MILLISECONDS - mapred.task.timeout setting (default: 72000000)\n";
        usage += "\t\t-failpct PCT - mapred.max.map.failures.percent (default: 0). Set to 10 to allow 10% of map tasks to fail\n";
        usage += "\tThe taskfile contains lines of the form:\n";
        usage += "\t\tFilePrefix<tab>Bag of (offset,FilePath) tuples\n";
        usage += "\t\tFilePrefix is the prefix to be used by the extracted files\n";
        usage += "\t\toffset is the start offset of a W/ARC record\n";
        usage += "\t\tFilePath is an HTTP or HDFS URL to the file to extract from\n";
        System.out.println(usage);
    }

    /**
     * Run the job.
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            printUsage();
            return 1;
        }

        // Create a job configuration
        JobConf job = new JobConf(getConf());

        // Set a human-readable job name.
        job.setJobName("Archive File Extractor");

        // This is a map-only job, no reducers.
        job.setNumReduceTasks(0);

        // turn off speculative execution
        job.setBoolean("mapred.map.tasks.speculative.execution", false);

        // set timeout to a high value - 20 hours
        job.setInt("mapred.task.timeout", 72000000);

        // tolerate task exceptions
        job.setBoolean("soft", false);

        int arg = 0;
        int numMaps = 10;

        String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n"
                + "format: WARC File Format 1.0\r\n"
                + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
                + "publisher: Internet Archive\r\n"
                + "created: %s\r\n\r\n";

        String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
                DateUtils.getLog17Date(System.currentTimeMillis()));

        while (arg < args.length - 1) {
            if (args[arg].equals("-soft")) {
                job.setBoolean("soft", true);
                arg++;
            } else if (args[arg].equals("-mappers")) {
                arg++;
                numMaps = Integer.parseInt(args[arg]);
                job.setNumMapTasks(numMaps);
                arg++;
            } else if (args[arg].equals("-timestamp14")) {
                arg++;
                String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
                job.set("timestamp14", timestamp14);
                arg++;
            } else if (args[arg].equals("-warc-header-local-file")) {
                arg++;
                File f = new File(args[arg]);
                FileInputStream fis = new FileInputStream(f);
                warcHeaderString = IOUtils.toString(fis, "UTF-8");
                arg++;
            } else if (args[arg].equals("-hmacname")) {
                arg++;
                String hmacName = args[arg];
                job.set("hmacName", hmacName);
                arg++;
            } else if (args[arg].equals("-hmacsignature")) {
                arg++;
                String hmacSignature = args[arg];
                job.set("hmacSignature", hmacSignature);
                arg++;
            } else if (args[arg].equals("-timeout")) {
                arg++;
                int taskTimeout = Integer.parseInt(args[arg]);
                job.setInt("mapred.task.timeout", taskTimeout);
                arg++;
            } else if (args[arg].equals("-failpct")) {
                arg++;
                int failPct = Integer.parseInt(args[arg]);
                job.setInt("mapred.max.map.failures.percent", failPct);
                arg++;
            } else {
                break;
            }
        }

        job.set("warcHeaderString", warcHeaderString);

        if (args.length - 2 != arg) {
            printUsage();
            return 1;
        }

        Path inputPath = new Path(args[arg]);
        arg++;
        String outputDir = args[arg];
        arg++;
        job.set("outputDir", outputDir);
        Path outputPath = new Path(outputDir);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(ArchiveFileExtractorMapper.class);
        job.setJarByClass(ArchiveFileExtractor.class);

        TextInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Run the job!
        RunningJob rj = JobClient.runJob(job);
        if (!rj.isSuccessful()) {
            LOG.error("FAILED: " + rj.getID());
            return 2;
        }
        return 0;
    }

    /**
     * Command-line driver. Runs the ArchiveFileExtractor as a Hadoop job.
     */
    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(new Configuration(), new ArchiveFileExtractor(), args);
        System.exit(result);
    }
}
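
/*
 * Illustrative usage (a sketch, not part of the original source): the jar name,
 * paths, prefix and offsets below are hypothetical; only the taskfile layout,
 * the option names and the SUCCESS/FAIL output lines come from printUsage()
 * and the mapper's parsing and collect logic above.
 *
 * A taskfile line pairs an output-file prefix with a bag of (offset,FilePath)
 * tuples, separated by a single tab:
 *
 *   EXAMPLE-PART-0001<TAB>{(0,hdfs://namenode:8020/data/crawl-0.warc.gz),(581632,http://archive.example.org/crawl-1.arc.gz)}
 *
 * Each FilePath must start with http:// or hdfs:// and end with .arc.gz or
 * .warc.gz. The job could then be launched along these lines, with all options
 * given before the taskfile and output directory:
 *
 *   hadoop jar ia-hadoop-tools.jar org.archive.hadoop.jobs.ArchiveFileExtractor \
 *       -mappers 20 -soft /user/op/taskfile.txt /user/op/extracted
 *
 * The records at the listed offsets are copied into new ARC/WARC files named
 * from the prefix (via FileNameSpec) under the output directory, and the job's
 * text output contains one SUCCESS or FAIL line per (offset,url) pair.
 */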