List of usage examples for org.apache.hadoop.io.MapWritable.entrySet()
@Override
public Set<Map.Entry<Writable, Writable>> entrySet()
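Before the project examples below, here is a minimal, self-contained sketch of the call being catalogued. The class name MapWritableEntrySetDemo and the sample keys/values are illustrative only and do not come from any of the projects listed.

import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MapWritableEntrySetDemo {
    public static void main(String[] args) {
        // Build a small MapWritable with heterogeneous Writable values
        MapWritable map = new MapWritable();
        map.put(new Text("path"), new Text("/docs/index.html"));
        map.put(new Text("score"), new IntWritable(42));

        // entrySet() exposes the map as Map.Entry<Writable, Writable> pairs,
        // so keys and values typically need a cast or toString() before use
        for (Map.Entry<Writable, Writable> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}

This is the common pattern shared by all the examples that follow: iterate the entry set and convert each Writable key/value into whatever the target system expects.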
From source file:com.dasasian.chok.lucene.integration.LuceneClientTest.java
License:Apache License
@Test
public void testGetDetailsConcurrently() throws Exception {
    miniCluster.deployIndex(LuceneTestResources.INDEX1, 1);
    LuceneClient client = new LuceneClient(miniCluster.createInteractionProtocol());
    final Query query = new QueryParser(Version.LUCENE_30, "", new KeywordAnalyzer()).parse("content: the");
    final Hits hits = client.search(query, null, 10);
    assertNotNull(hits);
    assertEquals(10, hits.getHits().size());
    List<MapWritable> detailList = client.getDetails(hits.getHits());
    assertEquals(hits.getHits().size(), detailList.size());
    for (int i = 0; i < detailList.size(); i++) {
        final MapWritable details1 = client.getDetails(hits.getHits().get(i));
        final MapWritable details2 = detailList.get(i);
        assertEquals(details1.entrySet(), details2.entrySet());
        final Set<Writable> keySet = details2.keySet();
        assertFalse(keySet.isEmpty());
        final Writable writable = details2.get(new Text("path"));
        assertNotNull(writable);
    }
    client.close();
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksWriter.java
License:Apache License
protected SolrInputDocument convertToSOLR(BehemothDocument doc) {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    // map from a Behemoth document to a SOLR one
    // the field names below should be modified to match the SOLR schema
    inputDoc.setField("id", doc.getUrl());
    inputDoc.setField("text", doc.getText());
    LOG.debug("Adding field : id\t" + doc.getUrl());
    // LOG.debug("Adding field : text\t" + doc.getText());
    // Rely on LucidWorks field mapping to handle this, or the dynamic fields
    MapWritable metadata = doc.getMetadata();
    if (includeMetadata && metadata != null) {
        for (Entry<Writable, Writable> entry : metadata.entrySet()) {
            inputDoc.addField(entry.getKey().toString(), entry.getValue().toString());
        }
    }
    // iterate on the annotations of interest and create a new field for each one.
    // It is advised NOT to send frequent annotation types such as token, as this
    // would generate a huge number of fields which won't be used by SOLR for
    // tokenizing anyway. What you can do instead is concatenate the token values
    // into a new content string separated by spaces.
    if (includeAnnotations) {
        Iterator<Annotation> iterator = doc.getAnnotations().iterator();
        while (iterator.hasNext()) {
            Annotation current = iterator.next();
            // check whether it belongs to a type we'd like to send to SOLR
            Map<String, String> featureField = fieldMapping.get(current.getType());
            if (featureField == null)
                continue;
            // iterate on the expected features
            for (String targetFeature : featureField.keySet()) {
                String SOLRFieldName = featureField.get(targetFeature);
                String value = null;
                if ("*".equals(targetFeature)) {
                    // special case for covering text
                    value = doc.getText().substring((int) current.getStart(), (int) current.getEnd());
                } else {
                    // get the value for the feature
                    value = current.getFeatures().get(targetFeature);
                }
                LOG.debug("Adding field : " + SOLRFieldName + "\t" + value);
                // skip if no value has been found
                if (value != null)
                    inputDoc.setField(SOLRFieldName, value);
            }
        }
    }
    float boost = 1.0f;
    inputDoc.setDocumentBoost(boost);
    return inputDoc;
}
From source file:com.digitalpebble.behemoth.solr.SOLRWriter.java
License:Apache License
protected SolrInputDocument convertToSOLR(BehemothDocument doc) {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    // map from a Behemoth document to a SOLR one
    // the field names below should be modified to match the SOLR schema
    inputDoc.setField("id", doc.getUrl());
    inputDoc.setField("text", doc.getText());
    LOG.debug("Adding field : id\t" + doc.getUrl());
    // Rely on the field mapping to handle this, or the dynamic fields
    MapWritable metadata = doc.getMetadata();
    if (includeMetadata && metadata != null) {
        for (Entry<Writable, Writable> entry : metadata.entrySet()) {
            if (useMetadataPrefix) {
                String key = metadataPrefix + entry.getKey().toString();
                inputDoc.addField(key, entry.getValue().toString());
            } else {
                inputDoc.addField(entry.getKey().toString(), entry.getValue().toString());
            }
        }
    }
    // iterate on the annotations of interest and create a new field for each one.
    // It is advised NOT to send frequent annotation types such as token, as this
    // would generate a huge number of fields which won't be used by SOLR for
    // tokenizing anyway. What you can do instead is concatenate the token values
    // into a new content string separated by spaces.
    if (includeAnnotations) {
        Iterator<Annotation> iterator = doc.getAnnotations().iterator();
        while (iterator.hasNext()) {
            Annotation current = iterator.next();
            // check whether it belongs to a type we'd like to send to SOLR
            Map<String, String> featureField = fieldMapping.get(current.getType());
            // special case of all annotations
            if (featureField == null && !includeAllAnnotations) {
                continue;
            }
            if (!includeAllAnnotations) {
                // iterate on the expected features
                for (String targetFeature : featureField.keySet()) {
                    String SOLRFieldName = featureField.get(targetFeature);
                    String value = null;
                    if ("*".equals(targetFeature)) {
                        // special case for covering text
                        value = doc.getText().substring((int) current.getStart(), (int) current.getEnd());
                    } else {
                        // get the value for the feature
                        value = current.getFeatures().get(targetFeature);
                    }
                    LOG.debug("Adding field : " + SOLRFieldName + "\t" + value);
                    // skip if no value has been found
                    if (value != null)
                        inputDoc.addField(SOLRFieldName, value);
                }
            } else {
                for (Entry<String, String> e : current.getFeatures().entrySet()) {
                    inputDoc.addField(annotationPrefix + current.getType() + "." + e.getKey(), e.getValue());
                }
            }
        }
    }
    float boost = 1.0f;
    inputDoc.setDocumentBoost(boost);
    return inputDoc;
}
From source file:com.facebook.hiveio.mapreduce.output.HiveTools.java
License:Apache License
/**
 * Map hive record
 *
 * @param conf Configuration
 * @param value data
 * @return hive record
 */
public static HiveWritableRecord mapToHiveRecord(Configuration conf, MapWritable value) {
    HiveTableSchema schema = HiveTableSchemas.lookup(conf, getHiveTableName());
    HiveWritableRecord record = HiveRecordFactory.newWritableRecord(schema);
    for (Map.Entry<Writable, Writable> entry : value.entrySet()) {
        IntWritable intKey = (IntWritable) entry.getKey();
        LongWritable longValue = (LongWritable) entry.getValue();
        record.set(intKey.get(), longValue.get());
    }
    return record;
}
From source file:com.jfolson.hive.serde.RTypedBytesWritableOutput.java
License:Apache License
public void writeMap(MapWritable mw) throws IOException {
    out.writeMapHeader(mw.size());
    for (Map.Entry<Writable, Writable> entry : mw.entrySet()) {
        write(entry.getKey());
        write(entry.getValue());
    }
}
From source file:com.redgate.hadoop.hive.azuretables.AzureTablesRecordWriter.java
License:Apache License
/**
 * Writes a MapWritable out to the Azure Table
 */
public void write(Writable w) throws IOException {
    MapWritable map = (MapWritable) w;
    properties.clear();
    for (Entry<Writable, Writable> e : map.entrySet()) {
        // TODO - more intelligent type mapping (making everything a string is hardly subtle)
        EntityProperty value = new EntityProperty(e.getValue().toString());
        properties.put(e.getKey().toString(), value);
    }
    DynamicTableEntity entity = new DynamicTableEntity(properties);
    entity.setPartitionKey(partitionKey);
    entity.setRowKey(UUID.randomUUID().toString());
    TableOperation op = TableOperation.insert(entity);
    try {
        tableClient.execute(table, op);
    } catch (StorageException e) {
        throw new IOException(e);
    }
}
From source file:com.TCG.Nutch_DNS.HostDbReducer.java
License:Apache License
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
        Reporter reporter) throws IOException {

    CrawlDatum fetch = new CrawlDatum();
    CrawlDatum old = new CrawlDatum();
    boolean fetchSet = false;
    boolean oldSet = false;
    byte[] signature = null;
    boolean multiple = false; // avoid deep copy when only single value exists
    linked.clear();
    org.apache.hadoop.io.MapWritable metaFromParse = null;

    while (values.hasNext()) {
        CrawlDatum datum = values.next();
        if (!multiple && values.hasNext())
            multiple = true;
        if (CrawlDatum.hasDbStatus(datum)) {
            if (!oldSet) {
                if (multiple) {
                    old.set(datum);
                } else {
                    // no need for a deep copy - this is the only value
                    old = datum;
                }
                oldSet = true;
            } else {
                // always take the latest version
                if (old.getFetchTime() < datum.getFetchTime())
                    old.set(datum);
            }
            continue;
        }

        if (CrawlDatum.hasFetchStatus(datum)) {
            if (!fetchSet) {
                if (multiple) {
                    fetch.set(datum);
                } else {
                    fetch = datum;
                }
                fetchSet = true;
            } else {
                // always take the latest version
                if (fetch.getFetchTime() < datum.getFetchTime())
                    fetch.set(datum);
            }
            continue;
        }

        switch (datum.getStatus()) { // collect other info
        case CrawlDatum.STATUS_LINKED:
            CrawlDatum link;
            if (multiple) {
                link = new CrawlDatum();
                link.set(datum);
            } else {
                link = datum;
            }
            linked.insert(link);
            break;
        case CrawlDatum.STATUS_SIGNATURE:
            signature = datum.getSignature();
            break;
        case CrawlDatum.STATUS_PARSE_META:
            metaFromParse = datum.getMetaData();
            break;
        default:
            LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
        }
    }

    // copy the content of the queue into a List in reversed order
    int numLinks = linked.size();
    List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
    for (int i = numLinks - 1; i >= 0; i--) {
        linkList.add(linked.pop());
    }

    // if it doesn't already exist, skip it
    if (!oldSet && !additionsAllowed)
        return;

    // if there is no fetched datum, perhaps there is a link
    if (!fetchSet && linkList.size() > 0) {
        fetch = linkList.get(0);
        fetchSet = true;
    }

    // still no new data - record only unchanged old data, if it exists, and return
    if (!fetchSet) {
        if (oldSet) { // at this point at least "old" should be present
            output.collect(key, old);
            reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1);
        } else {
            LOG.warn("Missing fetch and old value, signature=" + signature);
        }
        return;
    }

    if (signature == null)
        signature = fetch.getSignature();
    long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
    long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

    // initialize with the latest version, be it fetch or link
    result.set(fetch);
    if (oldSet) {
        // copy metadata from old, if it exists
        if (old.getMetaData().size() > 0) {
            result.putAllMetaData(old);
            // overlay with new, if any
            if (fetch.getMetaData().size() > 0)
                result.putAllMetaData(fetch);
        }
        // set the most recent valid value of modifiedTime
        if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
            result.setModifiedTime(old.getModifiedTime());
        }
    }

    switch (fetch.getStatus()) { // determine new status

    case CrawlDatum.STATUS_LINKED: // it was a link
        if (oldSet) { // if old exists
            result.set(old); // use it
        } else {
            result = schedule.initializeSchedule(key, result);
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            try {
                scfilters.initialScore(key, result);
            } catch (ScoringFilterException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                }
                result.setScore(0.0f);
            }
        }
        break;

    case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
    case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
    case CrawlDatum.STATUS_FETCH_REDIR_PERM:
    case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, not modified
        // determine the modification status
        int modified = FetchSchedule.STATUS_UNKNOWN;
        if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            modified = FetchSchedule.STATUS_NOTMODIFIED;
        } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            // only successful fetches (but not redirects, NUTCH-1422)
            // are detected as "not modified" by signature comparison
            if (oldSet && old.getSignature() != null && signature != null) {
                if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
                    modified = FetchSchedule.STATUS_MODIFIED;
                } else {
                    modified = FetchSchedule.STATUS_NOTMODIFIED;
                }
            }
        }
        // set the schedule
        result = schedule.setFetchSchedule(key, result, prevFetchTime, prevModifiedTime, fetch.getFetchTime(),
                fetch.getModifiedTime(), modified);
        // set the result status and signature
        if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
            result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
            // NUTCH-1341 The page is not modified according to its signature,
            // let's reset lastModified as well
            result.setModifiedTime(prevModifiedTime);
            if (oldSet)
                result.setSignature(old.getSignature());
        } else {
            switch (fetch.getStatus()) {
            case CrawlDatum.STATUS_FETCH_SUCCESS:
                result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_PERM:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
                break;
            default:
                LOG.warn("Unexpected status: " + fetch.getStatus() + " resetting to old status.");
                if (oldSet)
                    result.setStatus(old.getStatus());
                else
                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            }
            result.setSignature(signature);
        }
        // https://issues.apache.org/jira/browse/NUTCH-1656
        if (metaFromParse != null) {
            for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
                result.getMetaData().put(e.getKey(), e.getValue());
            }
        }
        // if fetchInterval is larger than the system-wide maximum, trigger
        // an unconditional recrawl. This prevents the page from being stuck in
        // the NOTMODIFIED state when the old fetched copy was already removed
        // with old segments.
        if (maxInterval < result.getFetchInterval())
            result = schedule.forceRefetch(key, result, false);
        break;

    case CrawlDatum.STATUS_SIGNATURE:
        if (LOG.isWarnEnabled()) {
            LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
        }
        return;

    case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
        if (oldSet) {
            result.setSignature(old.getSignature()); // use old signature
        }
        result = schedule.setPageRetrySchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        if (result.getRetriesSinceFetch() < retryMax) {
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        } else {
            result.setStatus(CrawlDatum.STATUS_DB_GONE);
            result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                    fetch.getFetchTime());
        }
        break;

    case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
        if (oldSet)
            result.setSignature(old.getSignature()); // use old signature
        result.setStatus(CrawlDatum.STATUS_DB_GONE);
        result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        break;

    default:
        throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
    }

    try {
        scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't update score, key=" + key + ": " + e);
        }
    }
    // remove generation time, if any
    result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
    output.collect(key, result);
    reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(result.getStatus())).increment(1);
}
From source file:io.apigee.lembos.mapreduce.converters.input.MapWritableConverter.java
License:Apache License
/**
 * Takes in a {@link MapWritable} and returns a {@link Scriptable} map.
 *
 * @param scope the JavaScript scope
 * @param writable the value to convert
 *
 * @return the {@link Scriptable} map equivalent
 */
@Override
public Object toJavaScript(final Scriptable scope, final MapWritable writable) {
    final Map<Object, Object> writableMap = new HashMap<>();
    for (final Map.Entry<Writable, Writable> mapEntry : writable.entrySet()) {
        writableMap.put(ConversionUtils.writableToJS(mapEntry.getKey(), scope),
                ConversionUtils.writableToJS(mapEntry.getValue(), scope));
    }
    return JavaScriptUtils.asObject(scope, writableMap);
}
From source file:net.sf.katta.integrationTest.lib.lucene.LuceneClientTest.java
License:Apache License
@Test
public void testGetDetailsConcurrently() throws KattaException, ParseException, InterruptedException {
    deployTestIndices(1, 1);
    ILuceneClient client = new LuceneClient(_miniCluster.getZkConfiguration());
    final Query query = new QueryParser(Version.LUCENE_35, "", new KeywordAnalyzer()).parse("content: the");
    final Hits hits = client.search(query, new String[] { INDEX_NAME }, 10);
    assertNotNull(hits);
    assertEquals(10, hits.getHits().size());
    List<MapWritable> detailList = client.getDetails(hits.getHits());
    assertEquals(hits.getHits().size(), detailList.size());
    for (int i = 0; i < detailList.size(); i++) {
        final MapWritable details1 = client.getDetails(hits.getHits().get(i));
        final MapWritable details2 = detailList.get(i);
        assertEquals(details1.entrySet(), details2.entrySet());
        final Set<Writable> keySet = details2.keySet();
        assertFalse(keySet.isEmpty());
        final Writable writable = details2.get(new Text("path"));
        assertNotNull(writable);
    }
    client.close();
}
From source file:nutchIndexer.NutchReduce.java
License:Open Source License
@Override
public void reduce(IntWritable docId, Iterable<MapWritable> documentsAnalyzed, Context context)
        throws IOException, InterruptedException {
    for (MapWritable documentAnalyzed : documentsAnalyzed) {
        for (MapWritable.Entry<Writable, Writable> termEntry : documentAnalyzed.entrySet()) {
            Text term = (Text) termEntry.getKey();
            IntWritable freq = (IntWritable) termEntry.getValue();
            Integer documentId = docId.get();
            this.invertedIndex.addPosting(term, documentId, freq);
        }
    }
}