Java tutorial: FlumeHarvesterSink - an Apache Flume sink that writes events into the Aleph2 data platform
/*******************************************************************************
 * Copyright 2015, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ikanow.aleph2.example.flume_harvester.services;

import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import scala.Tuple2;

import au.com.bytecode.opencsv.CSVParser;

import com.codepoetics.protonpack.StreamUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.ImmutableSet;
import com.ikanow.aleph2.data_model.interfaces.data_import.IHarvestContext;
import com.ikanow.aleph2.data_model.interfaces.data_services.ISearchIndexService;
import com.ikanow.aleph2.data_model.interfaces.data_services.IStorageService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataServiceProvider;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean.MasterEnrichmentType;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils;
import com.ikanow.aleph2.data_model.utils.ContextUtils;
import com.ikanow.aleph2.data_model.utils.ErrorUtils;
import com.ikanow.aleph2.data_model.utils.Lambdas;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Patterns;
import com.ikanow.aleph2.data_model.utils.SetOnce;
import com.ikanow.aleph2.data_model.utils.TimeUtils;
import com.ikanow.aleph2.data_model.utils.Tuples;
import com.ikanow.aleph2.example.flume_harvester.data_model.FlumeBucketConfigBean;
import com.ikanow.aleph2.example.flume_harvester.data_model.FlumeBucketConfigBean.OutputConfig.CsvConfig;
import com.ikanow.aleph2.example.flume_harvester.data_model.FlumeBucketConfigBean.OutputConfig.JsonConfig;
import com.ikanow.aleph2.example.flume_harvester.utils.FlumeUtils;

import fj.data.Either;
import fj.data.Validation;

/** Harvest sink - will provide some basic parsing (and maybe JS manipulation functionality in the future)
 * @author alex
 */
public class FlumeHarvesterSink extends AbstractSink implements Configurable {
    protected final static Logger _logger = LogManager.getLogger();
    protected final static ObjectMapper _mapper = BeanTemplateUtils.configureMapper(Optional.empty());

    IHarvestContext _context;
    Optional<FlumeBucketConfigBean> _config;
    DataBucketBean _bucket;
    Optional<String> _time_field;
    boolean _streaming = true;
    boolean _batch = true;

    /** Fixed config that writes batch output */
    final static FlumeBucketConfigBean _BATCH_CONFIG =
        BeanTemplateUtils.build(FlumeBucketConfigBean.class)
            .with(FlumeBucketConfigBean::output,
                BeanTemplateUtils.build(FlumeBucketConfigBean.OutputConfig.class)
                    .with(FlumeBucketConfigBean.OutputConfig::direct_output,
                            ImmutableSet.<String>builder().add("batch").build())
                    .done().get())
            .done().get();
    /* (non-Javadoc)
     * @see org.apache.flume.conf.Configurable#configure(org.apache.flume.Context)
     */
    @Override
    public void configure(Context flume_context) {
        try {
            _context = Lambdas.wrap_u(() -> ContextUtils.getHarvestContext(
                    FlumeUtils.decodeSignature(flume_context.getString("context_signature", "")))).get();
            _bucket = _context.getBucket().get();

            switch (Optional.ofNullable(_bucket.master_enrichment_type()).orElse(MasterEnrichmentType.none)) {
                case none:
                    _streaming = false; _batch = false;
                    break;
                case batch:
                    _streaming = false; _batch = true;
                    break;
                case streaming:
                    _streaming = true; _batch = false;
                    break;
                case streaming_and_batch: // (not really supported in many places)
                    _streaming = true; _batch = true;
                    break;
            }
            _logger.debug("Bucket = " + BeanTemplateUtils.toJson(_bucket));

            // Get config from bucket
            _config = Optional.of(_bucket).map(b -> b.harvest_configs()).filter(h -> !h.isEmpty())
                        .map(h -> h.iterator().next()).map(hcfg -> hcfg.config())
                        .map(hmap -> BeanTemplateUtils.from(hmap, FlumeBucketConfigBean.class).get());

            _logger.debug("_config = " + _config.map(BeanTemplateUtils::toJson).map(JsonNode::toString).orElse("(not present)"));

            _time_field = _config.flatMap(cfg -> Optionals.of(() -> cfg.output().add_time_with_name()))
                                .map(Optional::of) // prio #1: if manually specified
                                .orElse(Optionals.of(() -> _bucket.data_schema().temporal_schema())
                                        .filter(schema -> Optional.ofNullable(schema.enabled()).orElse(true)) // prio #2: (but only if temporal enabled)...
                                        .map(schema -> schema.time_field()) // ...use the time field
                                );
            //DEBUG
            _logger.debug("_time_field = " + _time_field);
        }
        catch (Throwable t) {
            _logger.error("Error initializing flume", t);
            throw t;
        }
    }
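    // Illustrative only (the field names are inferred from the accessors used in this class,
    // not taken from the project's own docs): the first harvest_config of the bucket carries a
    // config map that deserializes into FlumeBucketConfigBean, along these lines:
    //   {
    //     "output": {
    //       "add_time_with_name": "@timestamp",
    //       "direct_output": [ "search_index_service", "storage_service" ],
    //       "csv": { "enabled": true, "separator": ",", "header_fields": [ "ip", "date", "url" ] }
    //     }
    //   }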
    /* (non-Javadoc)
     * @see org.apache.flume.Sink#process()
     */
    @Override
    public Status process() throws EventDeliveryException {
        Status status = null;

        //TODO (ALEPH-10): handy to know: there's a timeout that appears to occur, so can log a heartbeat every N seconds and use in the poll freq to ensure this thread hasn't crashed...
        //TODO (ALEPH-10): also have a log the first time an error occurs, and maybe hourly log messages reporting data sizes

        // Start transaction
        final Channel ch = getChannel();
        final Transaction txn = ch.getTransaction();
        txn.begin();
        try {
            // This try clause includes whatever Channel operations you want to do
            final Event event = ch.take();

            final Optional<JsonNode> maybe_json_event = getEventJson(event, _config)
                    // Extra step: add the timestamp field, unless the JSON object already has one
                    // (objects that already have it are passed through unchanged via the orElse)
                    .map(json -> _time_field.filter(tf -> !json.has(tf))
                            .<JsonNode>map(tf -> ((ObjectNode) json).put(tf, LocalDateTime.now().toString()))
                            .orElse(json));

            maybe_json_event.ifPresent(json_event -> {
                if (_config.map(cfg -> cfg.output()).map(out -> out.direct_output()).isPresent()) {
                    this.directOutput(json_event, _config.get(), _bucket);
                }
                else {
                    if (_streaming) {
                        _context.sendObjectToStreamingPipeline(Optional.empty(), Either.left(json_event));
                    }
                    if (_batch) {
                        this.directOutput(json_event, _BATCH_CONFIG, _bucket);
                    }
                }
            });
            txn.commit();
            status = Status.READY;
        }
        catch (Throwable t) {
            //DEBUG
            //_logger.warn("Error", t);

            txn.rollback();

            // Log exception, handle individual exceptions as needed
            status = Status.BACKOFF;

            // re-throw all Errors
            if (t instanceof Error) {
                throw (Error) t;
            }
        }
        finally {
            txn.close();
        }
        return status;
    }

    /** Uses whatever parser is configured to create a JsonNode out of the event
     * TODO (ALEPH-10): for now just does a simple JSON mapping
     * @param evt
     * @param config
     * @return
     */
    protected Optional<JsonNode> getEventJson(final Event evt, final Optional<FlumeBucketConfigBean> config) {
        if (null == evt) { // (seem to get lots of these)
            return Optional.empty();
        }
        if (config.isPresent()) {
            final FlumeBucketConfigBean cfg = config.get();
            if (null != cfg.output()) {
                if ((null != cfg.output().csv()) && cfg.output().csv().enabled()) {
                    return getCsvEventJson(evt, cfg.output().csv());
                }
                else if ((null != cfg.output().json()) && cfg.output().json().enabled()) {
                    return getJsonEventJson(evt, cfg.output().json());
                }
            }
        }
        // Backstop:
        return getDefaultEventJson(evt);
    }
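    // Illustrative example (not from the original source): with neither CSV nor JSON parsing
    // enabled, an event with headers { "host": "web01" } and body "GET /index.html" falls
    // through to getDefaultEventJson below and becomes:
    //   { "host": "web01", "message": "GET /index.html" }
    // (process() then adds the configured time field, eg "@timestamp", if the object lacks it)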
    /** Default output - creates a JSON object from the event headers, with "message" containing the body of the event
     * (the timestamp is added separately, by process(), where one is configured)
     * TODO (ALEPH-10): make @timestamp derived (eg from temporal schema?)
     * @param evt
     * @return
     */
    protected Optional<JsonNode> getDefaultEventJson(final Event evt) {
        try {
            final JsonNode initial = _mapper.convertValue(evt.getHeaders(), JsonNode.class);
            return Optional.of(((ObjectNode) initial).put("message", new String(evt.getBody(), "UTF-8")));
        }
        catch (Exception e) {
            return Optional.empty();
        }
    }

    /** State object for JSON output
     * @author Alex
     */
    public static class JsonState {
        public JsonConfig.JsonPolicy policy;
        public String include_body_with_name;
    };
    protected final SetOnce<JsonState> _json = new SetOnce<>();

    /** Generates a JSON object assuming the event body is a JSON object
     * @param evt
     * @param config
     * @return
     */
    protected Optional<JsonNode> getJsonEventJson(final Event evt, JsonConfig config) {
        try {
            // Lazy initialization
            if (!_json.isSet()) {
                final JsonState json = new JsonState();
                json.policy = config.json_policy();
                json.include_body_with_name = Optional.ofNullable(config.include_body_with_name()).orElse("message");
                _json.set(json);
            }
            // Different cases depending on policy
            final JsonState json = _json.get();
            if (JsonConfig.JsonPolicy.body == json.policy) {
                final String body = new String(evt.getBody(), "UTF-8");
                final JsonNode initial = _mapper.readValue(body, JsonNode.class);
                return Optional.of(initial);
            }
            else if (JsonConfig.JsonPolicy.body_plus_headers == json.policy) {
                final String body = new String(evt.getBody(), "UTF-8");
                final ObjectNode initial = (ObjectNode) _mapper.readValue(body, JsonNode.class);
                final JsonNode to_return = evt.getHeaders().entrySet().stream()
                        .reduce(initial,
                                (acc, v) -> acc.put(v.getKey(), v.getValue()),
                                (acc1, acc2) -> acc1);
                return Optional.of(to_return);
            }
            else if (JsonConfig.JsonPolicy.event == json.policy) {
                final String body = new String(evt.getBody(), "UTF-8");
                final ObjectNode initial = (ObjectNode) _mapper.convertValue(evt.getHeaders(), JsonNode.class);
                return Optional.of(initial.put(json.include_body_with_name, body));
            }
            else if (JsonConfig.JsonPolicy.event_no_body == json.policy) {
                final JsonNode initial = _mapper.convertValue(evt.getHeaders(), JsonNode.class);
                return Optional.of(initial);
            }
            else return Optional.empty(); // (not supported/possible)
        }
        catch (Throwable t) {
            return Optional.empty();
        }
    }
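    // Illustrative summary of the JsonPolicy cases above, for an event with headers { "h": "1" }
    // and body {"b":2} (assuming the default include_body_with_name of "message"):
    //   body              -> { "b": 2 }
    //   body_plus_headers -> { "b": 2, "h": "1" }
    //   event             -> { "h": "1", "message": "{\"b\":2}" }  (body kept as a raw string)
    //   event_no_body     -> { "h": "1" }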
    /** State object for CSV output
     * @author Alex
     */
    public static class CsvState {
        public CSVParser parser = null;
        public ArrayList<String> headers;
        public Pattern ignore_regex;
        public Map<String, String> type_map;
    };
    protected final SetOnce<CsvState> _csv = new SetOnce<>();

    /** Generates a JSON object assuming the event body is a CSV line
     * @param evt
     * @param config
     * @return
     */
    protected Optional<JsonNode> getCsvEventJson(final Event evt, CsvConfig config) {
        if (!_csv.isSet()) {
            // Lazy initialization:
            final CsvState csv = new CsvState();
            // (map before charAt, so a missing setting falls back to the default instead of throwing)
            csv.parser = new CSVParser(
                    Optional.ofNullable(config.separator()).map(s -> s.charAt(0)).orElse(','),
                    Optional.ofNullable(config.quote_char()).map(s -> s.charAt(0)).orElse('"'),
                    Optional.ofNullable(config.escape_char()).map(s -> s.charAt(0)).orElse('\\'));
            csv.headers = new ArrayList<String>(Optionals.ofNullable(config.header_fields()));
            csv.type_map = !config.non_string_types().isEmpty()
                    ? config.non_string_types()
                    : config.non_string_type_map().entrySet().stream()
                            // (reverse the order of the map to get fieldname -> type)
                            .<Tuple2<String, String>>flatMap(kv -> kv.getValue().stream().map(v -> Tuples._2T(kv.getKey(), v)))
                            .collect(Collectors.toMap(t2 -> t2._2().toString(), t2 -> t2._1().toString()));
            Optional.ofNullable(config.ignore_regex())
                    .ifPresent(regex -> csv.ignore_regex = Pattern.compile(regex));
            _csv.set(csv);
        }
        try {
            final CsvState csv = _csv.get();
            final String line = new String(evt.getBody(), "UTF-8");
            if ((null != csv.ignore_regex) && csv.ignore_regex.matcher(line).matches()) {
                return Optional.empty();
            }
            final String[] fields = csv.parser.parseLine(line);
            final ObjectNode ret_val_pre = StreamUtils.zipWithIndex(Arrays.stream(fields))
                    .reduce(_mapper.createObjectNode(),
                            (acc, v) -> {
                                if (v.getIndex() >= csv.headers.size()) return acc;
                                else {
                                    final String field_name = csv.headers.get((int) v.getIndex());
                                    if ((null == field_name) || field_name.isEmpty()) {
                                        return acc;
                                    }
                                    else {
                                        try {
                                            return addField(acc, field_name, v.getValue(), csv.type_map);
                                        }
                                        catch (Exception e) {
                                            return acc;
                                        }
                                    }
                                }
                            },
                            (acc1, acc2) -> acc1); // (can't occur in practice)

            final ObjectNode ret_val = config.append_event_fields().stream()
                    .reduce(ret_val_pre,
                            (acc, v) -> {
                                final String value = evt.getHeaders().get(v);
                                return (null == value) ? acc : addField(acc, v, value, csv.type_map);
                            },
                            (acc1, acc2) -> acc1); // (can't occur in practice)

            return Optional.of(ret_val);
        }
        catch (Exception e) {
            return Optional.empty();
        }
    }

    protected static ObjectNode addField(final ObjectNode mutable_obj, final String field_name, final String value, Map<String, String> type_map) {
        return Patterns.match((String) type_map.get(field_name)).<ObjectNode>andReturn()
                .when(t -> null == t, __ -> mutable_obj.put(field_name, value)) // (string)
                .when(t -> t.equalsIgnoreCase("long"), __ -> mutable_obj.put(field_name, Long.parseLong(value)))
                .when(t -> t.equalsIgnoreCase("int") || t.equalsIgnoreCase("integer"), __ -> mutable_obj.put(field_name, Integer.parseInt(value)))
                .when(t -> t.equalsIgnoreCase("double") || t.equalsIgnoreCase("numeric"), __ -> mutable_obj.put(field_name, Double.parseDouble(value)))
                .when(t -> t.equalsIgnoreCase("float"), __ -> mutable_obj.put(field_name, Float.parseFloat(value)))
                .when(t -> t.equalsIgnoreCase("boolean"), __ -> mutable_obj.put(field_name, Boolean.parseBoolean(value)))
                .when(t -> t.equalsIgnoreCase("hex"), __ -> mutable_obj.put(field_name, Long.parseLong(value, 16)))
                .when(t -> t.equalsIgnoreCase("date"), __ -> {
                    Validation<String, Date> res = TimeUtils.getSchedule(value, Optional.empty());
                    return res.validation(left -> mutable_obj, right -> mutable_obj.put(field_name, right.toString()));
                })
                .otherwise(__ -> mutable_obj.put(field_name, value)); // (string)
    }
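    // Illustrative example (not from the original source): with header_fields [ "ip", "bytes" ]
    // and non_string_types { "bytes": "long" }, the event body "10.0.0.1,2048" parses via
    // getCsvEventJson/addField above to:
    //   { "ip": "10.0.0.1", "bytes": 2048 }
    // A value that fails to parse makes addField throw, and the per-column reduce above then
    // simply drops that field from the output object.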
    /** State object for direct output
     * @author Alex
     */
    public static class DirectOutputState {
        public Optional<IDataWriteService<JsonNode>> search_index_service;
        public Optional<IDataWriteService.IBatchSubservice<JsonNode>> batch_search_index_service;
        public Optional<IDataWriteService<JsonNode>> storage_service;
        public Optional<IDataWriteService.IBatchSubservice<JsonNode>> bulk_storage_service;
        public Optional<IDataWriteService<JsonNode>> batch_input_service;
        public Optional<IDataWriteService.IBatchSubservice<JsonNode>> bulk_batch_input_service;
        public boolean also_stream;
    }
    protected final SetOnce<DirectOutputState> _direct = new SetOnce<>();

    /** Outputs data directly into the Aleph2 data services without going via batch or streaming enrichment
     * @param json
     * @param config
     */
    protected void directOutput(final JsonNode json, FlumeBucketConfigBean config, final DataBucketBean bucket) {
        try {
            // Lazy initialization
            if (!_direct.isSet()) {
                final DirectOutputState direct = new DirectOutputState();

                // Search index service
                final Optional<ISearchIndexService> search_index_service =
                        (config.output().direct_output().contains("search_index_service")
                                ? _context.getServiceContext().getSearchIndexService()
                                : Optional.empty());
                direct.search_index_service = search_index_service.flatMap(IDataServiceProvider::getDataService)
                        .flatMap(s -> s.getWritableDataService(JsonNode.class, bucket, Optional.empty(), Optional.empty()));
                direct.batch_search_index_service = direct.search_index_service
                        .flatMap(IDataWriteService::getBatchWriteSubservice);

                // Storage service
                final Optional<IStorageService> storage_service =
                        (config.output().direct_output().contains("storage_service")
                                ? Optional.of(_context.getServiceContext().getStorageService())
                                : Optional.empty());
                direct.storage_service = storage_service.flatMap(IDataServiceProvider::getDataService)
                        .flatMap(s -> s.getWritableDataService(JsonNode.class, bucket,
                                Optional.of(IStorageService.StorageStage.processed.toString()), Optional.empty()));
                direct.bulk_storage_service = direct.storage_service
                        .flatMap(IDataWriteService::getBatchWriteSubservice);

                // Batch input (uses storage service logic)
                final Optional<IStorageService> batch_input_service =
                        (config.output().direct_output().contains("batch")
                                ? Optional.of(_context.getServiceContext().getStorageService())
                                : Optional.empty());
                direct.batch_input_service = batch_input_service.flatMap(IDataServiceProvider::getDataService)
                        .flatMap(s -> s.getWritableDataService(JsonNode.class, bucket,
                                Optional.of(IStorageService.StorageStage.transient_input.toString()
                                        + Optionals.of(() -> _config.get().output().batch_schema())
                                                .map(b -> BeanTemplateUtils.toJson(b).toString())
                                                .map(str -> ":" + str).orElse("")),
                                Optional.empty()));
                direct.bulk_batch_input_service = direct.batch_input_service
                        .flatMap(IDataWriteService::getBatchWriteSubservice);

                direct.also_stream = config.output().direct_output().contains("stream");

                _direct.set(direct);
            }
            final DirectOutputState direct = _direct.get();

            // Search index service
            direct.search_index_service.ifPresent(search_index_service -> {
                if (direct.batch_search_index_service.isPresent()) {
                    direct.batch_search_index_service.get().storeObject(json);
                }
                else {
                    search_index_service.storeObject(json);
                }
            });
            // Storage service
            direct.storage_service.ifPresent(storage_service -> {
                if (direct.bulk_storage_service.isPresent()) {
                    direct.bulk_storage_service.get().storeObject(json);
                }
                else {
                    storage_service.storeObject(json);
                }
            });
            // Batch input (uses storage service logic)
            direct.batch_input_service.ifPresent(storage_service -> {
                if (direct.bulk_batch_input_service.isPresent()) {
                    direct.bulk_batch_input_service.get().storeObject(json);
                }
                else {
                    storage_service.storeObject(json);
                }
            });
            // Streaming
            if (direct.also_stream) {
                _context.sendObjectToStreamingPipeline(Optional.empty(), Either.left(json));
            }
        }
        catch (Throwable t) {
            //DEBUG
            //System.out.println(ErrorUtils.getLongForm("{0}", t));
            //DEBUG
            //_logger.warn("Error", t);
        }
    }
}
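To experiment with the sink outside a full agent, the sketch below wires it to an in-memory channel the way the Flume runtime would. This is a minimal sketch, not part of the original project: the context_signature value is a placeholder that must be replaced by a real signature produced by the Aleph2 harvest technology (only then can configure() obtain an IHarvestContext). In a production deployment the sink is instead declared in the agent's properties file via its fully-qualified class name, with context_signature passed as a sink parameter.

import org.apache.flume.Context;
import org.apache.flume.Sink;
import org.apache.flume.channel.MemoryChannel;
import org.apache.flume.conf.Configurables;

import com.ikanow.aleph2.example.flume_harvester.services.FlumeHarvesterSink;

public class FlumeHarvesterSinkHarness {
    public static void main(String[] args) throws Exception {
        // In-memory channel with default settings, standing in for the agent-managed channel
        final MemoryChannel channel = new MemoryChannel();
        Configurables.configure(channel, new Context());

        final FlumeHarvesterSink sink = new FlumeHarvesterSink();
        sink.setChannel(channel);

        // "context_signature" is the one parameter configure() reads; "<SIGNATURE>" is a
        // placeholder - it must decode to a real IHarvestContext for configure() to succeed
        final Context sink_context = new Context();
        sink_context.put("context_signature", "<SIGNATURE>");
        sink.configure(sink_context);

        sink.start();
        final Sink.Status status = sink.process(); // one poll: takes an event, if any, and writes it
        System.out.println("Sink status: " + status);
        sink.stop();
    }
}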