Example usage for org.apache.hadoop.io Text equals

List of usage examples for org.apache.hadoop.io Text equals

Introduction

On this page you can find example usage for org.apache.hadoop.io.Text.equals().

Prototype

@Override
public boolean equals(Object o) 

Document

Returns true iff o is a Text with the same contents.
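Before the full examples, here is a minimal sketch of what this contract means in practice. It is not taken from any of the source files below and only assumes the standard org.apache.hadoop.io.Text API; the class name is made up for illustration.

import org.apache.hadoop.io.Text;

public class TextEqualsSketch {
    public static void main(String[] args) {
        Text a = new Text("row1");
        Text b = new Text("row1");
        Text c = new Text("row2");

        System.out.println(a.equals(b));      // true: both are Text objects with the same byte contents
        System.out.println(a.equals(c));      // false: contents differ
        System.out.println(a.equals("row1")); // false: a String is not a Text, even with the same characters
    }
}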

Usage

From source file:org.apache.accumulo.test.iterator.WholeRowIteratorTest.java

License:Apache License

private static TreeMap<Key, Value> createOutputData() {
    TreeMap<Key, Value> data = new TreeMap<>();

    Text row = null;
    List<Key> keys = new ArrayList<>();
    List<Value> values = new ArrayList<>();

    // Generate the output data from the input data
    for (Entry<Key, Value> entry : INPUT_DATA.entrySet()) {
        if (null == row) {
            row = entry.getKey().getRow();
        }

        if (!row.equals(entry.getKey().getRow())) {
            // Moved to the next row
            try {
                // Serialize and save
                Value encoded = WholeRowIterator.encodeRow(keys, values);
                data.put(new Key(row), encoded);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }

            // Empty the aggregated k-v's
            keys = new ArrayList<>();
            values = new ArrayList<>();
            // Set the new current row
            row = entry.getKey().getRow();
        }

        // Aggregate the current row
        keys.add(entry.getKey());
        values.add(entry.getValue());
    }

    if (!keys.isEmpty()) {
        try {
            Value encoded = WholeRowIterator.encodeRow(keys, values);
            data.put(new Key(row), encoded);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    return data;
}

From source file:org.apache.accumulo.test.performance.scan.CollectTabletStats.java

License:Apache License

private static void calcTabletStats(Connector conn, String table, Authorizations auths, int batchSize,
        KeyExtent ke, String[] columns) throws Exception {

    // long t1 = System.currentTimeMillis();

    Scanner scanner = conn.createScanner(table, auths);
    scanner.setBatchSize(batchSize);
    scanner.setRange(new Range(ke.getPrevEndRow(), false, ke.getEndRow(), true));

    for (String c : columns) {
        scanner.fetchColumnFamily(new Text(c));
    }

    Stat rowLen = new Stat();
    Stat cfLen = new Stat();
    Stat cqLen = new Stat();
    Stat cvLen = new Stat();
    Stat valLen = new Stat();
    Stat colsPerRow = new Stat();

    Text lastRow = null;
    int colsPerRowCount = 0;

    for (Entry<Key, Value> entry : scanner) {

        Key key = entry.getKey();
        Text row = key.getRow();

        if (lastRow == null) {
            lastRow = row;
        }

        if (!lastRow.equals(row)) {
            colsPerRow.addStat(colsPerRowCount);
            lastRow = row;
            colsPerRowCount = 0;
        }

        colsPerRowCount++;

        rowLen.addStat(row.getLength());
        cfLen.addStat(key.getColumnFamilyData().length());
        cqLen.addStat(key.getColumnQualifierData().length());
        cvLen.addStat(key.getColumnVisibilityData().length());
        valLen.addStat(entry.getValue().get().length);
    }

    synchronized (System.out) {
        System.out.println("");
        System.out.println("\tTablet " + ke.getUUID() + " statistics : ");
        printStat("Row length", rowLen);
        printStat("Column family length", cfLen);
        printStat("Column qualifier length", cqLen);
        printStat("Column visibility length", cvLen);
        printStat("Value length", valLen);
        printStat("Columns per row", colsPerRow);
        System.out.println("");
    }

}

From source file:org.apache.accumulo.tserver.tablet.TabletData.java

License:Apache License

public TabletData(KeyExtent extent, VolumeManager fs, Iterator<Entry<Key, Value>> entries) {
    final Text family = new Text();
    Text rowName = extent.getMetadataEntry();
    while (entries.hasNext()) {
        Entry<Key, Value> entry = entries.next();
        Key key = entry.getKey();
        Value value = entry.getValue();
        key.getColumnFamily(family);
        if (key.compareRow(rowName) != 0) {
            log.info("Unexpected metadata table entry for {}: {}", extent, key.getRow());
            continue;
        }
        if (ServerColumnFamily.TIME_COLUMN.hasColumns(entry.getKey())) {
            if (time == null) {
                time = value.toString();
            }
        } else if (DataFileColumnFamily.NAME.equals(family)) {
            FileRef ref = new FileRef(fs, key);
            dataFiles.put(ref, new DataFileValue(entry.getValue().get()));
        } else if (DIRECTORY_COLUMN.hasColumns(key)) {
            directory = value.toString();
        } else if (family.equals(LogColumnFamily.NAME)) {
            logEntris.add(LogEntry.fromKeyValue(key, entry.getValue()));
        } else if (family.equals(ScanFileColumnFamily.NAME)) {
            scanFiles.add(new FileRef(fs, key));
        } else if (FLUSH_COLUMN.hasColumns(key)) {
            flushID = Long.parseLong(value.toString());
        } else if (COMPACT_COLUMN.hasColumns(key)) {
            compactID = Long.parseLong(entry.getValue().toString());
        } else if (family.equals(LastLocationColumnFamily.NAME)) {
            lastLocation = new TServerInstance(value, key.getColumnQualifier());
        } else if (family.equals(BulkFileColumnFamily.NAME)) {
            Long id = Long.decode(value.toString());
            List<FileRef> lst = bulkImported.get(id);
            if (lst == null) {
                bulkImported.put(id, lst = new ArrayList<>());
            }
            lst.add(new FileRef(fs, key));
        } else if (PREV_ROW_COLUMN.hasColumns(key)) {
            KeyExtent check = new KeyExtent(key.getRow(), value);
            if (!check.equals(extent)) {
                throw new RuntimeException("Found bad entry for " + extent + ": " + check);
            }
        }
    }
    if (time == null && dataFiles.isEmpty() && extent.equals(RootTable.OLD_EXTENT)) {
        // recovery... old root tablet has no data, so time doesn't matter:
        time = TabletTime.LOGICAL_TIME_ID + "" + Long.MIN_VALUE;
    }
}

From source file:org.apache.accumulo.tserver.TabletServer.java

License:Apache License

static Value checkTabletMetadata(KeyExtent extent, TServerInstance instance,
        SortedMap<Key, Value> tabletsKeyValues, Text metadataEntry) throws AccumuloException {

    TServerInstance future = null;
    Value prevEndRow = null;
    Value dir = null;
    Value time = null;
    for (Entry<Key, Value> entry : tabletsKeyValues.entrySet()) {
        Key key = entry.getKey();
        if (!metadataEntry.equals(key.getRow())) {
            log.info("Unexpected row in tablet metadata " + metadataEntry + " " + key.getRow());
            return null;
        }
        Text cf = key.getColumnFamily();
        if (cf.equals(TabletsSection.FutureLocationColumnFamily.NAME)) {
            if (future != null) {
                throw new AccumuloException("Tablet has multiple future locations " + extent);
            }
            future = new TServerInstance(entry.getValue(), key.getColumnQualifier());
        } else if (cf.equals(TabletsSection.CurrentLocationColumnFamily.NAME)) {
            log.info("Tablet seems to be already assigned to "
                    + new TServerInstance(entry.getValue(), key.getColumnQualifier()));
            return null;
        } else if (TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key)) {
            prevEndRow = entry.getValue();
        } else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
            dir = entry.getValue();
        } else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
            time = entry.getValue();
        }
    }

    if (prevEndRow == null) {
        throw new AccumuloException("Metadata entry does not have prev row (" + metadataEntry + ")");
    } else {
        KeyExtent ke2 = new KeyExtent(metadataEntry, prevEndRow);
        if (!extent.equals(ke2)) {
            log.info("Tablet prev end row mismatch " + extent + " " + ke2.getPrevEndRow());
            return null;
        }
    }

    if (dir == null) {
        throw new AccumuloException("Metadata entry does not have directory (" + metadataEntry + ")");
    }

    if (time == null && !extent.equals(RootTable.OLD_EXTENT)) {
        throw new AccumuloException("Metadata entry does not have time (" + metadataEntry + ")");
    }

    if (future == null) {
        log.info("The master has not assigned " + extent + " to " + instance);
        return null;
    }

    if (!instance.equals(future)) {
        log.info("Table " + extent + " has been assigned to " + future + " which is not " + instance);
        return null;
    }

    return dir;
}

From source file:org.apache.giraph.hadoop.BspTokenSelector.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public Token<JobTokenIdentifier> selectToken(Text service,
        Collection<Token<? extends TokenIdentifier>> tokens) {
    if (service == null) {
        return null;
    }
    Text kindName = new Text("mapreduce.job");
    for (Token<? extends TokenIdentifier> token : tokens) {
        if (kindName.equals(token.getKind())) {
            return (Token<JobTokenIdentifier>) token;
        }
    }
    return null;
}

From source file:org.apache.gora.accumulo.store.AccumuloStore.java

License:Apache License

public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
    ByteSequence row = null;

    Map<Utf8, Object> currentMap = null;
    List currentArray = null;
    Text currentFam = null;
    int currentPos = 0;
    Schema currentSchema = null;
    Field currentField = null;

    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);

    while (iter.hasNext()) {
        Entry<Key, Value> entry = iter.next();

        if (row == null) {
            row = entry.getKey().getRowData();
        }
        byte[] val = entry.getValue().get();

        Field field = fieldMap.get(getFieldName(entry));

        if (currentMap != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
                        fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, currentMap);
                currentMap = null;
            }
        } else if (currentArray != null) {
            if (currentFam.equals(entry.getKey().getColumnFamily())) {
                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                continue;
            } else {
                persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
                currentArray = null;
            }
        }

        switch (field.schema().getType()) {
        case MAP: // first entry only. Next are handled above on the next loop
            currentMap = new DirtyMapWrapper<Utf8, Object>(new HashMap<Utf8, Object>());
            currentPos = field.pos();
            currentFam = entry.getKey().getColumnFamily();
            currentSchema = field.schema().getValueType();

            currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
                    fromBytes(currentSchema, entry.getValue().get()));
            break;
        case ARRAY:
            currentArray = new DirtyListWrapper<Object>(new ArrayList<Object>());
            currentPos = field.pos();
            currentFam = entry.getKey().getColumnFamily();
            currentSchema = field.schema().getElementType();
            currentField = field;

            currentArray.add(fromBytes(currentSchema, entry.getValue().get()));

            break;
        case UNION:// default value of null acts like union with null
            Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
            // map and array were coded without union index so need to be read the same way
            if (effectiveSchema.getType() == Type.ARRAY) {
                currentArray = new DirtyListWrapper<Object>(new ArrayList<Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = field.schema().getElementType();
                currentField = field;

                currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
                break;
            } else if (effectiveSchema.getType() == Type.MAP) {
                currentMap = new DirtyMapWrapper<Utf8, Object>(new HashMap<Utf8, Object>());
                currentPos = field.pos();
                currentFam = entry.getKey().getColumnFamily();
                currentSchema = effectiveSchema.getValueType();

                currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
                        fromBytes(currentSchema, entry.getValue().get()));
                break;
            }
            // continue like a regular top-level union
        case RECORD:
            SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
            persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
            break;
        default:
            persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
        }
    }

    if (currentMap != null) {
        persistent.put(currentPos, currentMap);
    } else if (currentArray != null) {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
    }

    persistent.clearDirty();

    return row;
}

From source file:org.apache.mahout.clustering.minhash.LastfmClusterEvaluator.java

License:Apache License

/**
 * Calculate the overall cluster precision by sampling clusters. Precision is
 * calculated as follows:
 * 
 * 1. For a sample of all the clusters calculate the pair-wise similarity
 * (Jaccard coefficient) for items in the same cluster.
 * 
 * 2. Count true positives as items whose similarity is above the specified
 * threshold.
 * 
 * 3. Precision = (true positives) / (total items in clusters sampled).
 * 
 * @param clusterFile
 *          The file containing cluster information
 * @param threshold
 *          Similarity threshold for considering two items in a cluster to be
 *          relevant. Must be between 0.0 and 1.0
 * @param samplePercentage
 *          Percentage of clusters to sample. Must be between 0.0 and 1.0
 */
private static void testPrecision(Path clusterFile, double threshold, double samplePercentage) {
    Configuration conf = new Configuration();
    Random rand = RandomUtils.getRandom();
    Text prevCluster = new Text();
    List<List<Integer>> listenerVectors = Lists.newArrayList();
    long similarListeners = 0;
    long allListeners = 0;
    int clustersProcessed = 0;
    for (Pair<Text, VectorWritable> record : new SequenceFileIterable<Text, VectorWritable>(clusterFile, true,
            conf)) {
        Text cluster = record.getFirst();
        VectorWritable point = record.getSecond();
        if (!cluster.equals(prevCluster)) {
            // We got a new cluster
            prevCluster.set(cluster.toString());
            // Should we check previous cluster ?
            if (rand.nextDouble() > samplePercentage) {
                listenerVectors.clear();
                continue;
            }
            int numListeners = listenerVectors.size();
            allListeners += numListeners;
            for (int i = 0; i < numListeners; i++) {
                List<Integer> listenerVector1 = listenerVectors.get(i);
                for (int j = i + 1; j < numListeners; j++) {
                    List<Integer> listenerVector2 = listenerVectors.get(j);
                    double similarity = computeSimilarity(listenerVector1, listenerVector2);
                    similarListeners += similarity >= threshold ? 1 : 0;
                }
            }
            listenerVectors.clear();
            clustersProcessed++;
            System.out.print('\r' + usedMemory() + " Clusters processed: " + clustersProcessed);
        }
        List<Integer> listeners = Lists.newArrayList();
        for (Vector.Element ele : point.get()) {
            listeners.add((int) ele.get());
        }
        listenerVectors.add(listeners);
    }
    System.out.println("\nTest Results");
    System.out.println("=============");
    System.out.println(" (A) Listeners in same cluster with simiarity above threshold (" + threshold + ") : "
            + similarListeners);
    System.out.println(" (B) All listeners: " + allListeners);
    NumberFormat format = NumberFormat.getInstance();
    format.setMaximumFractionDigits(2);
    double precision = (double) similarListeners / allListeners * 100.0;
    System.out.println(" Average cluster precision: A/B = " + format.format(precision));
}
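The computeSimilarity helper called in the loop above is not shown on this page. As a hedged sketch only, assuming it implements the pair-wise Jaccard coefficient described in the javadoc (the actual Mahout implementation may differ), it could look like this:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class JaccardSketch {
    // Jaccard coefficient: |intersection| / |union| of the two listener lists, viewed as sets.
    public static double jaccardSimilarity(List<Integer> a, List<Integer> b) {
        Set<Integer> union = new HashSet<>(a);
        union.addAll(b);
        if (union.isEmpty()) {
            return 0.0; // both lists empty; treat similarity as zero
        }
        Set<Integer> intersection = new HashSet<>(a);
        intersection.retainAll(b);
        return (double) intersection.size() / union.size();
    }
}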

From source file:org.apache.nutch.fetcher.FetcherThread.java

License:Apache License

private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status,
        int outlinkDepth) throws InterruptedException {

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();

        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(),
                        key, e);
            }
        }

        if (status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (parsing && !(skipTruncated && ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key,
                            StringUtils.stringifyException(e));
                }
            }

            if (parseResult == null && (parsing || signatureWithoutParsing)) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
                        new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }

        /*
         * Store status code in content So we can read this value during parsing
         * (as a separate job) and decide to parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }

    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();

                if (!parseStatus.isSuccess()) {
                    LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key,
                            parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }

                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(),
                                Thread.currentThread().getId(), key, e);
                    }
                }

                String origin = null;

                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    }
                    // use host 
                    else {
                        origin = originURL.getHost().toLowerCase();
                    }
                }

                //used by fetchNode         
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;

                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();

                    if (toUrl.length() > maxOutlinkLength) {
                        continue;
                    }
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin,
                            ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode,
                            urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }

                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }

                //Publish fetch report event 
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT,
                            url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE,
                            parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG,
                            parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    queue.alreadyFetched.add(url.toString().hashCode());

                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());

                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;

                    String followUrl;

                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();

                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }

                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);

                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl),
                                new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode,
                                outlinkDepth + 1);

                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);

                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);

                        outlinkCounter++;
                    }
                }

                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

                context.write(url, new NutchWritable(
                        new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:", e);
        }
    }

    // return parse status (of the "original" URL if the ParseResult contains
    // multiple parses) which allows Fetcher to follow meta-redirects
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()])
                    .increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}

From source file:org.apache.nutch.tools.arc.ArcSegmentCreator.java

License:Apache License

/**
 * <p>Parses the raw content of a single record to create output.  This method
 * is almost the same as the {@link org.apache.nutch.Fetcher#output} method in
 * terms of processing and output.
 * 
 * @param output  The job output collector.
 * @param segmentName The name of the segment to create.
 * @param key The url of the record.
 * @param datum The CrawlDatum of the record.
 * @param content The raw content of the record
 * @param pstatus The protocol status
 * @param status The fetch status.
 * 
 * @return The result of the parse in a ParseStatus object.
 */
private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName, Text key,
        CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) {

    // set the fetch status and the fetch time
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
        }

        try {

            // parse the content
            parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
        }

        // set the content signature
        if (parseResult == null) {
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
                    new ParseStatus().getEmptyParse(getConf()));
            datum.setSignature(signature);
        }

        try {
            output.collect(key, new NutchWritable(datum));
            output.collect(key, new NutchWritable(content));

            if (parseResult != null) {
                for (Entry<Text, Parse> entry : parseResult) {
                    Text url = entry.getKey();
                    Parse parse = entry.getValue();
                    ParseStatus parseStatus = parse.getData().getStatus();

                    if (!parseStatus.isSuccess()) {
                        LOG.warn("Error parsing: " + key + ": " + parseStatus);
                        parse = parseStatus.getEmptyParse(getConf());
                    }

                    // Calculate page signature. 
                    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
                    // Ensure segment name and score are in parseData metadata
                    parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
                            StringUtil.toHexString(signature));
                    // Pass fetch time to content meta
                    parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                            Long.toString(datum.getFetchTime()));
                    if (url.equals(key))
                        datum.setSignature(signature);
                    try {
                        scfilters.passScoreAfterParsing(url, content, parse);
                    } catch (Exception e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                        }
                    }
                    output.collect(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()),
                            parse.getData(), parse.isCanonical())));
                }
            }
        } catch (IOException e) {
            if (LOG.isErrorEnabled()) {
                LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
            }
        }

        // return parse status if it exists
        if (parseResult != null && !parseResult.isEmpty()) {
            Parse p = parseResult.get(content.getUrl());
            if (p != null) {
                return p.getData().getStatus();
            }
        }
    }

    return null;
}

From source file:org.apache.rya.export.accumulo.parent.AccumuloParentMetadataRepository.java

License:Apache License

private MergeParentMetadata getMetadataFromTable() throws ParentMetadataDoesNotExistException {
    try {
        // Create an Accumulo scanner that iterates through the metadata entries.
        final Scanner scanner = connector.createScanner(mergeParentMetadataTableName, new Authorizations());
        final Iterator<Entry<Key, Value>> entries = scanner.iterator();

        // No metadata has been stored in the table yet.
        if (!entries.hasNext()) {
            log.error("Could not find any MergeParentMetadata metadata in the table named: "
                    + mergeParentMetadataTableName);
        }

        // Fetch the metadata from the entries.
        String ryaInstanceName = null;
        Date timestamp = null;
        Date filterTimestamp = null;
        Long parentTimeOffset = null;

        while (entries.hasNext()) {
            final Entry<Key, Value> entry = entries.next();
            final Text columnQualifier = entry.getKey().getColumnQualifier();
            final byte[] value = entry.getValue().get();

            if (columnQualifier.equals(MERGE_PARENT_METADATA_RYA_INSTANCE_NAME)) {
                ryaInstanceName = STRING_LEXICODER.decode(value);
            } else if (columnQualifier.equals(MERGE_PARENT_METADATA_TIMESTAMP)) {
                timestamp = DATE_LEXICODER.decode(value);
            } else if (columnQualifier.equals(MERGE_PARENT_METADATA_FILTER_TIMESTAMP)) {
                filterTimestamp = DATE_LEXICODER.decode(value);
            } else if (columnQualifier.equals(MERGE_PARENT_METADATA_PARENT_TIME_OFFSET)) {
                parentTimeOffset = LONG_LEXICODER.decode(value);
            }
        }

        return new MergeParentMetadata(ryaInstanceName, timestamp, filterTimestamp, parentTimeOffset);
    } catch (final TableNotFoundException e) {
        throw new ParentMetadataDoesNotExistException(
                "Could not add results to a MergeParentMetadata because the MergeParentMetadata table does not exist.",
                e);
    } catch (final Exception e) {
        throw new ParentMetadataDoesNotExistException("Error occurred while getting merge parent metadata.", e);
    }
}