/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.tools;

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.logging.Logger;

import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexSegment;
import org.apache.nutch.io.MD5Hash;
import org.apache.nutch.fs.*;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.segment.SegmentReader;
import org.apache.nutch.segment.SegmentWriter;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;

/**
 * This class cleans up accumulated segment data and merges it into a single
 * segment (or optionally several segments), with no duplicates in it.
 *
 * <p>
 * There are no prerequisites for its correct operation except for a set of
 * already fetched segments (they don't have to contain parsed content; only
 * fetcher output is required). This tool does not use DeleteDuplicates, but
 * creates its own "master" index of all pages in all segments. Then it walks
 * sequentially through this index and picks up only the most recent version
 * of each page, for every unique value of URL hash or content hash.
 * </p>
 * <p>If some of the input segments are corrupted, this tool will attempt to
 * repair them, using the
 * {@link org.apache.nutch.segment.SegmentReader#fixSegment(NutchFileSystem, File, boolean, boolean, boolean, boolean)} method.</p>
 * <p>The output segment can optionally be split on the fly into several
 * segments of fixed length.</p>
 * <p>
 * The newly created segment(s) can then optionally be indexed, so that they
 * can either be merged with more new segments, or used for searching as they
 * are.
 * </p>
 * <p>
 * Old segments may optionally be removed, because all needed data has already
 * been copied to the new merged segment. NOTE: this tool will also remove all
 * corrupted input segments, which are not usable anyway - however, this option
 * may be dangerous if you inadvertently included non-segment directories as
 * input...</p>
 * <p>
 * You may want to run SegmentMergeTool instead of following the manual procedures,
 * with all options turned on, i.e. to merge segments into the output segment(s),
 * index them, and then delete the original segments data.
 * </p>
 *
 * @author Andrzej Bialecki <ab@getopt.org>
 */
public class SegmentMergeTool implements Runnable {

  public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.tools.SegmentMergeTool");

  /** Log progress update every LOG_STEP items. */
  public static int LOG_STEP = 20000;
  /** Temporary de-dup index size. Larger indexes tend to slow down indexing.
   * Too many indexes slow down the subsequent index merging. It's a tradeoff value... */
  public static int INDEX_SIZE = 250000;
  public static int INDEX_MERGE_FACTOR = 3000;
  public static int INDEX_MIN_MERGE_DOCS = 16;

  private boolean boostByLinkCount = NutchConf.get().getBoolean("indexer.boost.by.link.count", false);
  private float scorePower = NutchConf.get().getFloat("indexer.score.power", 0.5f);
  private NutchFileSystem nfs = null;
  private File[] segments = null;
  private int stage = SegmentMergeStatus.STAGE_OPENING;
  private long totalRecords = 0L;
  private long processedRecords = 0L;
  private long start = 0L;
  private long maxCount = Long.MAX_VALUE;
  private File output = null;
  private List segdirs = null;
  private List allsegdirs = null;
  private boolean runIndexer = false;
  private boolean delSegs = false;
  private HashMap readers = new HashMap();

  /**
   * Create a SegmentMergeTool.
   * @param nfs filesystem
   * @param segments list of input segments
   * @param output output directory, where output segments will be created
   * @param maxCount maximum number of records per output segment. If this
   * value is 0, then the default value {@link Long#MAX_VALUE} is used.
   * @param runIndexer run indexer on output segment(s)
   * @param delSegs delete input segments when finished
   * @throws Exception
   */
  public SegmentMergeTool(NutchFileSystem nfs, File[] segments, File output, long maxCount,
          boolean runIndexer, boolean delSegs) throws Exception {
    this.nfs = nfs;
    this.segments = segments;
    this.runIndexer = runIndexer;
    this.delSegs = delSegs;
    if (maxCount > 0) this.maxCount = maxCount;
    allsegdirs = Arrays.asList(segments);
    this.output = output;
    if (nfs.exists(output)) {
      if (!nfs.isDirectory(output))
        throw new Exception("Output is not a directory: " + output);
    } else nfs.mkdirs(output);
  }

  public static class SegmentMergeStatus {
    public static final int STAGE_OPENING = 0;
    public static final int STAGE_MASTERIDX = 1;
    public static final int STAGE_MERGEIDX = 2;
    public static final int STAGE_DEDUP = 3;
    public static final int STAGE_WRITING = 4;
    public static final int STAGE_INDEXING = 5;
    public static final int STAGE_DELETING = 6;
    public static final String[] stages = {
      "opening input segments",
      "creating master index",
      "merging sub-indexes",
      "deduplicating",
      "writing output segment(s)",
      "indexing output segment(s)",
      "deleting input segments"
    };
    public int stage;
    public File[] inputSegments;
    public long startTime, curTime;
    public long totalRecords;
    public long processedRecords;

    public SegmentMergeStatus() { }

    public SegmentMergeStatus(int stage, File[] inputSegments, long startTime,
            long totalRecords, long processedRecords) {
      this.stage = stage;
      this.inputSegments = inputSegments;
      this.startTime = startTime;
      this.curTime = System.currentTimeMillis();
      this.totalRecords = totalRecords;
      this.processedRecords = processedRecords;
    }
  }

  public SegmentMergeStatus getStatus() {
    SegmentMergeStatus status = new SegmentMergeStatus(stage, segments, start,
            totalRecords, processedRecords);
    return status;
  }

  /** Run the tool, periodically reporting progress.
   */
  public void run() {
    start = System.currentTimeMillis();
    stage = SegmentMergeStatus.STAGE_OPENING;
    long delta;
    LOG.info("* Opening " + allsegdirs.size() + " segments:");
    try {
      segdirs = new ArrayList();
      // open all segments
      for (int i = 0; i < allsegdirs.size(); i++) {
        File dir = (File) allsegdirs.get(i);
        SegmentReader sr = null;
        try {
          // try to autofix it if corrupted...
          sr = new SegmentReader(nfs, dir, true);
        } catch (Exception e) {
          // this segment is hosed beyond repair, don't use it
          LOG.warning("* Segment " + dir.getName() + " is corrupt beyond repair; skipping it.");
          continue;
        }
        segdirs.add(dir);
        totalRecords += sr.size;
        LOG.info(" - segment " + dir.getName() + ": " + sr.size + " records.");
        readers.put(dir.getName(), sr);
      }
      long total = totalRecords;
      LOG.info("* TOTAL " + total + " input records in " + segdirs.size() + " segments.");
      LOG.info("* Creating master index...");
      stage = SegmentMergeStatus.STAGE_MASTERIDX;
      // XXX Note that Lucene indexes don't work with NutchFileSystem for now.
      // XXX For now always assume LocalFileSystem here...
      Vector masters = new Vector();
      File fsmtIndexDir = new File(output, ".fastmerge_index");
      File masterDir = new File(fsmtIndexDir, "0");
      if (!masterDir.mkdirs()) {
        LOG.severe("Could not create a master index dir: " + masterDir);
        return;
      }
      masters.add(masterDir);
      IndexWriter iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
      iw.setUseCompoundFile(false);
      iw.setMergeFactor(INDEX_MERGE_FACTOR);
      iw.setRAMBufferSizeMB(INDEX_MIN_MERGE_DOCS);
      long s1 = System.currentTimeMillis();
      Iterator it = readers.values().iterator();
      processedRecords = 0L;
      delta = System.currentTimeMillis();
      while (it.hasNext()) {
        SegmentReader sr = (SegmentReader) it.next();
        String name = sr.segmentDir.getName();
        FetcherOutput fo = new FetcherOutput();
        for (long i = 0; i < sr.size; i++) {
          try {
            if (!sr.get(i, fo, null, null, null)) break;

            Document doc = new Document();

            // compute boost
            float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
                    scorePower, boostByLinkCount, fo.getAnchors().length);
            // store only minimal metadata: the source segment and record number ("sd"),
            // the url and content hashes used for deduplication, and the fetch time,
            // score and url length used to decide which duplicate wins
            doc.add(new Field("sd", name + "|" + i, true, false, false));
            doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
            doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
            doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
            doc.add(new Field("score", boost + "", true, false, false));
            doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
            iw.addDocument(doc);
            processedRecords++;
            if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
              LOG.info(" Processed " + processedRecords + " records ("
                      + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
              delta = System.currentTimeMillis();
            }
            if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
              iw.optimize();
              iw.close();
              LOG.info(" - creating next subindex...");
              masterDir = new File(fsmtIndexDir, "" + masters.size());
              if (!masterDir.mkdirs()) {
                LOG.severe("Could not create a master index dir: " + masterDir);
                return;
              }
              masters.add(masterDir);
              iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
              iw.setUseCompoundFile(false);
              iw.setMergeFactor(INDEX_MERGE_FACTOR);
              iw.setRAMBufferSizeMB(INDEX_MIN_MERGE_DOCS);
            }
          } catch (Throwable t) {
            // we can assume the data is invalid from now on - break here
            LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
            break;
          }
        }
      }
      iw.optimize();
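
      // At this point every readable input record is represented by a small
      // "stub" document in exactly one of the sub-indexes under
      // .fastmerge_index; the stubs carry only the dedup keys and a pointer
      // back to the source segment, never the page content itself.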
      LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
      s1 = System.currentTimeMillis();
      // merge all other indexes using the latest IndexWriter (still open):
      if (masters.size() > 1) {
        LOG.info(" - merging subindexes...");
        stage = SegmentMergeStatus.STAGE_MERGEIDX;
        IndexReader[] ireaders = new IndexReader[masters.size() - 1];
        for (int i = 0; i < masters.size() - 1; i++)
          ireaders[i] = IndexReader.open((File) masters.get(i));
        iw.addIndexes(ireaders);
        for (int i = 0; i < masters.size() - 1; i++) {
          ireaders[i].close();
          FileUtil.fullyDelete((File) masters.get(i));
        }
      }
      iw.close();
      LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
      LOG.info("* Removing duplicate entries...");
      stage = SegmentMergeStatus.STAGE_DEDUP;
      IndexReader ir = IndexReader.open(masterDir);
      int i = 0;
      long cnt = 0L;
      processedRecords = 0L;
      s1 = System.currentTimeMillis();
      delta = s1;
      TermEnum te = ir.terms();
      while (te.next()) {
        Term t = te.term();
        if (t == null) continue;
        if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
        cnt++;
        processedRecords = cnt / 2;
        if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
          LOG.info(" Processed " + processedRecords + " records ("
                  + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        // Enumerate all docs with the same URL hash or content hash
        TermDocs td = ir.termDocs(t);
        if (td == null) continue;
        if (t.field().equals("uh")) {
          // Keep only the latest version of the document with the same url hash.
          // Note: even if the content hash is identical, other metadata may be
          // different, so even in this case it makes sense to keep the latest version.
          int id = -1;
          String time = null;
          Document doc = null;
          while (td.next()) {
            int docid = td.doc();
            if (!ir.isDeleted(docid)) {
              doc = ir.document(docid);
              if (time == null) {
                time = doc.get("time");
                id = docid;
                continue;
              }
              String dtime = doc.get("time");
              // "time" is a DateField, and can be compared lexicographically
              if (dtime.compareTo(time) > 0) {
                if (id != -1) {
                  ir.deleteDocument(id);
                }
                time = dtime;
                id = docid;
              } else {
                ir.deleteDocument(docid);
              }
            }
          }
        } else if (t.field().equals("ch")) {
          // Keep only the version of the document with the highest score,
          // and then with the shortest url.
          int id = -1;
          int ul = 0;
          float score = 0.0f;
          Document doc = null;
          while (td.next()) {
            int docid = td.doc();
            if (!ir.isDeleted(docid)) {
              doc = ir.document(docid);
              if (ul == 0) {
                try {
                  ul = Integer.parseInt(doc.get("ul"));
                  score = Float.parseFloat(doc.get("score"));
                } catch (Exception e) { }
                id = docid;
                continue;
              }
              int dul = 0;
              float dscore = 0.0f;
              try {
                dul = Integer.parseInt(doc.get("ul"));
                dscore = Float.parseFloat(doc.get("score"));
              } catch (Exception e) { }
              int cmp = Float.compare(dscore, score);
              if (cmp == 0) {
                // equal scores, select the one with shortest url
                if (dul < ul) {
                  if (id != -1) {
                    ir.deleteDocument(id);
                  }
                  ul = dul;
                  id = docid;
                } else {
                  ir.deleteDocument(docid);
                }
              } else if (cmp < 0) {
                ir.deleteDocument(docid);
              } else {
                if (id != -1) {
                  ir.deleteDocument(id);
                }
                ul = dul;
                id = docid;
              }
            }
          }
        }
      }
      //
      // keep the IndexReader open...
      //
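      // The master index reader is deliberately left open: the surviving
      // (non-deleted) stub documents are read back below, and their "sd"
      // field is resolved against the open SegmentReaders to copy the actual
      // records into the output segment(s).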

      LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
      stage = SegmentMergeStatus.STAGE_WRITING;
      processedRecords = 0L;
      Vector outDirs = new Vector();
      File outDir = new File(output, SegmentWriter.getNewSegmentName());
      outDirs.add(outDir);
      LOG.info("* Merging all segments into " + output.getName());
      s1 = System.currentTimeMillis();
      delta = s1;
      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
        if (ir.isDeleted(n)) {
          //System.out.println("-del");
          continue;
        }
        Document doc = ir.document(n);
        String segDoc = doc.get("sd");
        int idx = segDoc.indexOf('|');
        String segName = segDoc.substring(0, idx);
        String docName = segDoc.substring(idx + 1);
        SegmentReader sr = (SegmentReader) readers.get(segName);
        long docid;
        try {
          docid = Long.parseLong(docName);
        } catch (Exception e) {
          continue;
        }
        try {
          // get data from the reader
          sr.get(docid, fo, co, pt, pd);
        } catch (Throwable thr) {
          // don't break the loop, because only one of the segments
          // may be corrupted...
          LOG.fine(" - corrupt record no. " + docid + " in segment "
                  + sr.segmentDir.getName() + " - skipping.");
          continue;
        }
        sw.append(fo, co, pt, pd);
        outputCnt++;
        processedRecords++;
        if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
          LOG.info(" Processed " + processedRecords + " records ("
                  + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        if (processedRecords % maxCount == 0) {
          sw.close();
          outDir = new File(output, SegmentWriter.getNewSegmentName());
          LOG.fine(" - starting next output segment in " + outDir.getName());
          nfs.mkdirs(outDir);
          sw = new SegmentWriter(nfs, outDir, true);
          outDirs.add(outDir);
        }
      }
      LOG.info("* Merging took " + (System.currentTimeMillis() - s1) + " ms");
      ir.close();
      sw.close();
      FileUtil.fullyDelete(fsmtIndexDir);
      for (Iterator iter = readers.keySet().iterator(); iter.hasNext();) {
        SegmentReader sr = (SegmentReader) readers.get(iter.next());
        sr.close();
      }
      if (runIndexer) {
        stage = SegmentMergeStatus.STAGE_INDEXING;
        totalRecords = outDirs.size();
        processedRecords = 0L;
        LOG.info("* Creating new segment index(es)...");
        File workingDir = new File(output, "indexsegment-workingdir");
        for (int k = 0; k < outDirs.size(); k++) {
          processedRecords++;
          if (workingDir.exists()) {
            FileUtil.fullyDelete(workingDir);
          }
          IndexSegment indexer = new IndexSegment(nfs, Integer.MAX_VALUE,
                  (File) outDirs.get(k), workingDir);
          indexer.indexPages();
          FileUtil.fullyDelete(workingDir);
        }
      }
      if (delSegs) {
        // This deletes also all corrupt segments, which are
        // unusable anyway
        stage = SegmentMergeStatus.STAGE_DELETING;
        totalRecords = allsegdirs.size();
        processedRecords = 0L;
        LOG.info("* Deleting old segments...");
        for (int k = 0; k < allsegdirs.size(); k++) {
          processedRecords++;
          FileUtil.fullyDelete((File) allsegdirs.get(k));
        }
      }
      delta = System.currentTimeMillis() - start;
      float eps = (float) total / (float) (delta / 1000);
      LOG.info("Finished SegmentMergeTool: INPUT: " + total + " -> OUTPUT: " + outputCnt
              + " entries in " + ((float) delta / 1000f) + " s (" + eps + " entries/sec).");
    } catch (Exception e) {
      e.printStackTrace();
      LOG.severe(e.getMessage());
    }
  }

  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Too few arguments.\n");
      usage();
      System.exit(-1);
    }
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    boolean runIndexer = false;
    boolean delSegs = false;
    long maxCount = Long.MAX_VALUE;
    String segDir = null;
    File output = null;
    Vector dirs = new Vector();
    for (int i = 0; i < args.length; i++) {
      if (args[i] == null) continue;
      if (args[i].equals("-o")) {
        if (args.length > i + 1) {
          output = new File(args[++i]);
          continue;
        } else {
          LOG.severe("Required value of '-o' argument missing.\n");
          usage();
          return;
        }
      } else if (args[i].equals("-i")) {
        runIndexer = true;
      } else if (args[i].equals("-cm")) {
        LOG.warning("'-cm' option obsolete - ignored.");
      } else if (args[i].equals("-max")) {
        String cnt = args[++i];
        try {
          maxCount = Long.parseLong(cnt);
        } catch (Exception e) {
          LOG.warning("Invalid count '" + cnt + "', setting to Long.MAX_VALUE.");
        }
      } else if (args[i].equals("-ds")) {
        delSegs = true;
      } else if (args[i].equals("-dir")) {
        segDir = args[++i];
      } else dirs.add(new File(args[i]));
    }
    if (segDir != null) {
      File sDir = new File(segDir);
      if (!sDir.exists() || !sDir.isDirectory()) {
        LOG.warning("Invalid path: " + sDir);
      } else {
        File[] files = sDir.listFiles(new FileFilter() {
          public boolean accept(File f) {
            return f.isDirectory();
          }
        });
        if (files != null && files.length > 0) {
          for (int i = 0; i < files.length; i++) dirs.add(files[i]);
        }
      }
    }
    if (dirs.size() == 0) {
      LOG.severe("No input segments.");
      return;
    }
    if (output == null) output = ((File) dirs.get(0)).getParentFile();
    SegmentMergeTool st = new SegmentMergeTool(nfs, (File[]) dirs.toArray(new File[0]),
            output, maxCount, runIndexer, delSegs);
    st.run();
  }

  private static void usage() {
    System.err.println("SegmentMergeTool (-local | -nfs ...) (-dir <input_segments_dir> | seg1 seg2 ...) [-o <output_segments_dir>] [-max count] [-i] [-ds]");
    System.err.println("\t-dir <input_segments_dir>\tpath to directory containing input segments");
    System.err.println("\tseg1 seg2 seg3\t\tindividual paths to input segments");
    System.err.println("\t-o <output_segment_dir>\t(optional) path to directory which will\n\t\t\t\tcontain output segment(s).\n\t\t\tNOTE: If not present, the original segments path will be used.");
    System.err.println("\t-max count\t(optional) output multiple segments, each with maximum 'count' entries");
    System.err.println("\t-i\t\t(optional) index the output segment when finished merging.");
    System.err.println("\t-ds\t\t(optional) delete the original input segments when finished.");
    System.err.println();
  }
}
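
// Example usage (a sketch, not part of the original class; all paths below are
// hypothetical). From the command line the tool takes the options printed by
// usage(), e.g. assuming the generic Nutch class runner and a local filesystem:
//
//   bin/nutch org.apache.nutch.tools.SegmentMergeTool -local \
//       -dir crawl/segments -o crawl/segments_merged -max 1000000 -i -ds
//
// Because the class implements Runnable and exposes getStatus(), a caller can
// also run it on its own thread and poll progress. A minimal sketch, assuming
// NutchFileSystem.get() returns the configured (local) filesystem, that every
// entry under crawl/segments is a segment directory, and with exception
// handling omitted:
//
//   NutchFileSystem nfs = NutchFileSystem.get();
//   File[] segs = new File("crawl/segments").listFiles();
//   SegmentMergeTool tool = new SegmentMergeTool(nfs, segs,
//       new File("crawl/segments_merged"), 0L, true, false);
//   Thread t = new Thread(tool);
//   t.start();
//   while (t.isAlive()) {
//     SegmentMergeTool.SegmentMergeStatus s = tool.getStatus();
//     System.out.println(SegmentMergeTool.SegmentMergeStatus.stages[s.stage]
//         + ": " + s.processedRecords + "/" + s.totalRecords);
//     Thread.sleep(10000);
//   }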