Java tutorial
/* Copyright 2012-2013 Fabian Steeg. Licensed under the Eclipse Public License 1.0 */ package org.lobid.lodmill.hadoop; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.elasticsearch.action.bulk.BulkItemResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.client.Client; import org.elasticsearch.client.IndicesAdminClient; import org.elasticsearch.client.transport.NoNodeAvailableException; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.json.simple.JSONObject; import org.json.simple.JSONValue; import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.io.CharStreams; /** * Index JSON-LD Hadoop output in HDFS into an ElasticSearch instance. * * @author Fabian Steeg (fsteeg) */ public class IndexFromHdfsInElasticSearch { private static final int BULK_SIZE = 1000; private static final Logger LOG = LoggerFactory.getLogger(IndexFromHdfsInElasticSearch.class); private final FileSystem hdfs; private final Client client; /** * @param args Pass 4 params: hdfs-server hdfs-input-path es-host * es-cluster-name to index files in hdfs-input-path from HDFS on * hdfs-server into es-cluster-name on es-host. */ public static void main(final String[] args) { if (args.length != 4) { System.err.println("Pass 4 params: <hdfs-server> <hdfs-input-path>" + " <es-host> <es-cluster-name> to index files in" + " <hdfs-input-path> from HDFS on <hdfs-server> into" + " <es-cluster-name> on <es-host>."); System.exit(-1); } try (FileSystem hdfs = FileSystem.get(URI.create(args[0]), new Configuration())) { final Client client = new TransportClient(ImmutableSettings.settingsBuilder() .put("cluster.name", args[3]).put("client.transport.sniff", false) .put("client.transport.ping_timeout", 20, TimeUnit.SECONDS).build()) .addTransportAddress(new InetSocketTransportAddress(args[2], 9300)); final IndexFromHdfsInElasticSearch indexer = new IndexFromHdfsInElasticSearch(hdfs, client); indexer.indexAll(args[1].endsWith("/") ? args[1] : args[1] + "/"); } catch (IOException e) { e.printStackTrace(); } } /** * @param hdfs The HDFS to index from * @param client The ElasticSearch client for indexing */ public IndexFromHdfsInElasticSearch(final FileSystem hdfs, final Client client) { this.hdfs = hdfs; this.client = client; } /** * Index all data in the given directory * * @param dir The directory to index * @return A list of responses for requests that failed * @throws IOException When HDFS operations fail */ public List<BulkItemResponse> indexAll(final String dir) throws IOException { checkPathInHdfs(dir); final List<BulkItemResponse> result = new ArrayList<>(); final FileStatus[] listStatus = hdfs.listStatus(new Path(dir)); for (FileStatus fileStatus : listStatus) { LOG.info("Indexing: " + fileStatus.getPath().getName()); if (fileStatus.getPath().getName().startsWith("part-")) { result.addAll(indexOne(dir + fileStatus.getPath().getName())); } } return result; } /** * Index data at the given location * * @param data The data location * @return A list of responses for requests that failed * @throws IOException When HDFS operations fail */ public List<BulkItemResponse> indexOne(final String data) throws IOException { checkPathInHdfs(data); final FSDataInputStream inputStream = hdfs.open(new Path(data)); try (Scanner scanner = new Scanner(inputStream, "UTF-8")) { final List<BulkItemResponse> result = runBulkRequests(scanner, client); return result; } } /** * @param scanner The scanner for reading the data to index * @param c The elasticsearch client to use for indexing * @return A list responses for requests that errored */ public static List<BulkItemResponse> runBulkRequests(final Scanner scanner, Client c) { final List<BulkItemResponse> result = new ArrayList<>(); int lineNumber = 0; String meta = null; BulkRequestBuilder bulkRequest = c.prepareBulk(); while (scanner.hasNextLine()) { final String line = scanner.nextLine(); if (lineNumber % 2 == 0) { // every first line is index info meta = line; } else { // every second line is value object addIndexRequest(meta, bulkRequest, line, c); } lineNumber++; /* Split into multiple bulks to ease server load: */ if (lineNumber % BULK_SIZE == 0) { runBulkRequest(bulkRequest, result); bulkRequest = c.prepareBulk(); } } /* Run the final bulk, if there is anything to do: */ if (bulkRequest.numberOfActions() > 0) { runBulkRequest(bulkRequest, result); } return result; } private static void addIndexRequest(final String meta, final BulkRequestBuilder bulkRequest, final String line, Client c) { try { final Map<String, Object> map = (Map<String, Object>) JSONValue.parseWithException(line); final IndexRequestBuilder requestBuilder = createRequestBuilder(meta, map, c); bulkRequest.add(requestBuilder); } catch (ParseException e) { LOG.error( String.format("ParseException with meta '%s' and line '%s': '%s'", meta, line, e.getMessage()), e); } } private static void runBulkRequest(final BulkRequestBuilder bulkRequest, final List<BulkItemResponse> result) { final BulkResponse bulkResponse = executeBulkRequest(bulkRequest); if (bulkResponse == null) { LOG.error("Bulk request failed: " + bulkRequest); } else { collectFailedResponses(result, bulkResponse); } } private static void collectFailedResponses(final List<BulkItemResponse> result, final BulkResponse bulkResponse) { for (BulkItemResponse response : bulkResponse) { if (response.isFailed()) { LOG.error(String.format("Bulk item response failed for index '%s', ID '%s', message: %s", response.getIndex(), response.getId(), response.getFailureMessage())); result.add(response); } } } private static IndexRequestBuilder createRequestBuilder(final String meta, final Map<String, Object> map, Client c) { final JSONObject object = (JSONObject) ((JSONObject) JSONValue.parse(meta)).get("index"); final String index = (String) object.get("_index"); final String type = (String) object.get("_type"); final String id = (String) object.get("_id"); // NOPMD final IndicesAdminClient admin = c.admin().indices(); if (!admin.prepareExists(index).execute().actionGet().isExists()) { admin.prepareCreate(index).setSource(config()).execute().actionGet(); } return c.prepareIndex(index, type, id).setSource(map); } private static String config() { String res = null; try { final InputStream config = Thread.currentThread().getContextClassLoader() .getResourceAsStream("index-config.json"); try (InputStreamReader reader = new InputStreamReader(config, "UTF-8")) { res = CharStreams.toString(reader); } } catch (IOException e) { LOG.error(e.getMessage(), e); } return res; } private static BulkResponse executeBulkRequest(final BulkRequestBuilder bulk) { BulkResponse bulkResponse = null; int retries = 40; while (retries > 0) { try { bulkResponse = bulk.execute().actionGet(); break; } catch (NoNodeAvailableException e) { // Retry on NoNodeAvailableException, see // https://github.com/elasticsearch/elasticsearch/issues/1868 retries--; try { Thread.sleep(10000); } catch (InterruptedException x) { LOG.error(x.getMessage(), x); } LOG.error(String.format("Retry bulk index request after exception: %s (%s more retries)", e.getMessage(), retries)); } } return bulkResponse; } private void checkPathInHdfs(final String data) throws IOException { if (!hdfs.exists(new Path(data))) { throw new IllegalArgumentException("No such path in HDFS: " + data); } } }