Java tutorial
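The listing below is the Harvester class from the solr-geonames project. It reads a tab-delimited GeoNames data dump line by line, turns each row into a SolrInputDocument (boosting selected countries and populated places), and indexes the documents into an embedded Solr core in batches of 20,000 rows, committing after each batch and optimizing the index at the end.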
/*
 * Geonames Solr Index - Harvester
 * Copyright (C) 2011 University of Southern Queensland
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package com.googlecode.solrgeonames.harvester;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A harvester to parse and index the geo data into an embedded Solr index.
 *
 * @author Greg Pendlebury
 */
public class Harvester {
    /** Logging */
    private static Logger log = LoggerFactory.getLogger(Harvester.class);

    /** Geonames uses tab-delimited files */
    private static String DELIMITER = "\t";

    /** Rows indexed per batch */
    private static int BATCH_SIZE = 20000;

    /** Solr file names */
    private static String SOLR_CONFIG = "solrconfig.xml";
    private static String SOLR_SCHEMA = "schema.xml";

    /** If true, load the alternate names field */
    private static boolean withAlternateNames;

    /** Name of the Geonames input file, from the command line */
    private static String geonamesFileName;

    /** Country codes to boost in the Solr index */
    private static List<String> countryIdsToBoost;

    /** Buffered reader for line-by-line processing */
    private BufferedReader reader;

    /** Basic date formatter */
    private DateFormat dateFormat;

    /** Column mappings */
    private static final Map<String, Integer> columns;
    static {
        columns = new LinkedHashMap<String, Integer>();
        columns.put("id", 0);
        columns.put("utf8_name", 1);
        columns.put("basic_name", 2);
        columns.put("alternate_names", 3);
        columns.put("latitude", 4);
        columns.put("longitude", 5);
        columns.put("feature_class", 6);
        columns.put("feature_code", 7);
        columns.put("country_code", 8);
        // Skip other Country Codes : 9
        // Skip Admin Codes : 10-13
        columns.put("population", 14);
        columns.put("elevation", 15);
        columns.put("gtopo30", 16);
        columns.put("timezone", 17);
        columns.put("date_modified", 18);
    }

    /** Columns to leave out of the index */
    private List<String> columnsToExclude;

    /** Solr index */
    private SolrCore solrCore;
    private CoreContainer solrContainer;
    private EmbeddedSolrServer solrServer;

    /**
     * Basic constructor. Instantiate our reader and Solr.
     *
     * @param sourceFile: The input file to read
     * @param countryIdsToBoost: Country identifiers to boost
     * @param columnsToExclude: Column names to exclude from the index
     *
     * @throws Exception if any errors occur
     */
    public Harvester(File sourceFile, List<String> countryIdsToBoost,
            List<String> columnsToExclude) throws Exception {
        this.countryIdsToBoost = countryIdsToBoost;
        this.columnsToExclude = columnsToExclude;

        // Variables
        InputStream inStream = null;
        Reader fileReader = null;

        // Open a stream to the file
        try {
            inStream = new FileInputStream(sourceFile);
        } catch (IOException ex) {
            log.error("Error opening file stream!");
            throw new Exception(ex);
        }

        // Instantiate a UTF-8 reader from the stream
        try {
            fileReader = new InputStreamReader(inStream, "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            try {
                inStream.close();
            } catch (IOException ioex) {
                log.error("Failed closing input stream");
            }
            log.error("Error starting file reader!");
            throw new Exception(ex);
        }
        reader = new BufferedReader(fileReader);

        // Time to bring Solr online: find the Solr home
        String solrHome = System.getProperty("geonames.solr.home");
        if (solrHome == null) {
            throw new Exception("No 'geonames.solr.home' provided!");
        }
        solrServer = startSolr(solrHome);
    }

    /**
     * Start up an embedded Solr server.
     *
     * @param home: The path to the Solr home directory
     * @return EmbeddedSolrServer: The instantiated server
     * @throws Exception if any errors occur
     */
    private EmbeddedSolrServer startSolr(String home) throws Exception {
        try {
            SolrConfig solrConfig = new SolrConfig(home, SOLR_CONFIG, null);
            IndexSchema schema = new IndexSchema(solrConfig, SOLR_SCHEMA, null);

            solrContainer = new CoreContainer(new SolrResourceLoader(
                    SolrResourceLoader.locateSolrHome()));
            CoreDescriptor descriptor = new CoreDescriptor(solrContainer, "",
                    solrConfig.getResourceLoader().getInstanceDir());
            descriptor.setConfigName(solrConfig.getResourceName());
            descriptor.setSchemaName(schema.getResourceName());

            solrCore = new SolrCore(null, solrConfig.getDataDir(),
                    solrConfig, schema, descriptor);
            solrContainer.register("", solrCore, false);
            return new EmbeddedSolrServer(solrContainer, "");
        } catch (Exception ex) {
            log.error("\nFailed to start Solr server\n");
            throw ex;
        }
    }

    /**
     * Return the current date/time.
     *
     * @return Date: A Date object with the current date/time
     */
    private Date now() {
        return new Date();
    }

    /**
     * Return a formatted time String of the current time.
     *
     * @return String: The current time String in the format HH:MM:SS
     */
    private String time() {
        return time(now());
    }

    /**
     * Return a formatted time String for the supplied Date.
     *
     * @param date: The Date object to format
     * @return String: The formatted time String in the format HH:MM:SS
     */
    private String time(Date date) {
        if (dateFormat == null) {
            dateFormat = new SimpleDateFormat("HH:mm:ss");
        }
        return dateFormat.format(date);
    }

    /**
     * Get the data indicated by the field name, after looking up the index
     * from the columns map.
     *
     * @param data: An array of strings containing column data
     * @param field: The field name
     * @return String: The data in that field, null if the field does not exist
     */
    private String get(String[] data, String field) {
        Integer index = columns.get(field);
        if (index == null) {
            log.error("Field does not exist: {}", field);
            return null;
        }
        return data[index];
    }

    /**
     * Force a commit against the underlying Solr database.
     */
    private void commit() {
        try {
            solrServer.commit();
        } catch (Exception ex) {
            log.error("Failed to commit: ", ex);
        }
    }

    /**
     * Force an optimize call against the underlying Solr database.
     */
    private void optimize() {
        try {
            solrServer.optimize();
        } catch (Exception ex) {
            log.error("Failed to optimize: ", ex);
        }
    }

    /**
     * Main processing loop for the harvest.
     *
     * @param counter: The number of rows to read during this loop
     * @param print: Debugging flag to print all data processed
     * @return int: The number of rows read this pass
     * @throws Exception if any errors occur
     */
    public int loop(int counter, boolean print) throws Exception {
        String line = null;
        int i = 0;
        try {
            while (i < counter && (line = reader.readLine()) != null) {
                String[] row = line.split(DELIMITER);
                i++;
                if (print) {
                    log.debug("====================");
                    log.debug("Line: {}", i);
                }
                process(row, print);
            }
        } catch (IOException ex) {
            throw new Exception(ex);
        }
        return i;
    }

    /**
     * Trivial test for empty Geonames data. Looks for null, empty strings,
     * or single space characters.
     *
     * @param input: The data to test
     * @return boolean: True if the data is considered 'empty', otherwise False
     */
    private boolean empty(String input) {
        if (input == null || input.equals("") || input.equals(" ")) {
            return true;
        }
        return false;
    }

    /**
     * Process a row of data pulled from Geonames.
     *
     * @param row: A String array containing the columns of data
     * @param print: Debugging flag to print all data processed
     */
    private void process(String[] row, boolean print) {
        if (print) {
            for (String key : columns.keySet()) {
                System.out.format("%17s => %20s\n", key, get(row, key));
            }
        }
        try {
            solrServer.add(createSolrDoc(row));
        } catch (Exception ex) {
            log.error("Failed to add document:");
            for (String key : columns.keySet()) {
                System.out.format("%17s => %20s\n", key, get(row, key));
            }
            log.error("Stack trace: ", ex);
        }
    }

    /**
     * Create a Solr document from the provided Geonames column data.
     *
     * @param row: A String array containing the columns of data
     * @return SolrInputDocument: The prepared document
     */
    private SolrInputDocument createSolrDoc(String[] row) {
        float boost = 1.0f;
        SolrInputDocument doc = new SolrInputDocument();
        for (String key : columns.keySet()) {
            if (columnsToExclude.contains(key)) {
                continue;
            }
            String data = get(row, key);

            // Fix dates
            if (key.equals("date_modified")) {
                data += "T00:00:00Z";
            }

            // Sometimes the geonames 'asciiname' is empty
            if (key.equals("basic_name")) {
                if (empty(data)) {
                    data = get(row, "utf8_name");
                    //log.warn("{}: ASCII Name missing," +
                    //        " using UTF-8 version: '{}'", now(), data);
                }
                // We need a 'string' version, and a reversed copy of it
                String string = data.toLowerCase();
                doc.addField("basic_name_str", string);
                String rev = new StringBuffer(string).reverse().toString();
                doc.addField("basic_name_rev", rev);
            }

            // Boost some countries
            if (countryIdsToBoost != null && key.equals("country_code")) {
                if (countryIdsToBoost.contains(data)) {
                    boost *= 5;
                }
            }

            // Boost populated locations
            if (key.equals("feature_code")) {
                if (data.startsWith("PPL")) {
                    boost *= 2;
                }
            }

            if (!empty(data)) {
                if (key.equals("alternate_names")) {
                    String[] alternate_names = data.split(",");
                    for (String alternate_name : alternate_names) {
                        doc.addField(key, alternate_name);
                    }
                } else {
                    doc.addField(key, data);
                }
            }
        }

        // We place the boost on a field that holds the same value in every
        // record, then add 'AND boost:boost' to all queries.
        doc.addField("boost", "boost", boost);
        return doc;
    }

    /**
     * Shutdown function for cleaning up the instantiated objects.
     */
    public void shutdown() {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ex) {
                log.error("Error shutting down the Reader!", ex);
            }
        }
        if (solrContainer != null) {
            solrContainer.shutdown();
        }
    }

    /**
     * Command line entry point.
     *
     * @param args: Array of String parameters from the command line
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            usage();
            return;
        }

        // Evaluate the input parameters
        for (String arg : args) {
            evalParam(arg);
        }

        // Validate the mandatory parameter
        if (geonamesFileName == null) {
            log.error("ERROR: No input file was provided!");
            usage();
            return;
        }
        File file = new File(geonamesFileName);
        if (!file.exists()) {
            log.error("ERROR: The input file does not exist!");
            usage();
            return;
        }

        // Exclude alternate names unless they were requested
        List<String> columnsToExclude = new ArrayList<String>();
        if (!withAlternateNames) {
            columnsToExclude.add("alternate_names");
        }

        // Get ready to harvest
        Harvester harvester = null;
        try {
            harvester = new Harvester(file, countryIdsToBoost,
                    columnsToExclude);
        } catch (Exception ex) {
            // A reason for death was logged in the constructor
            log.error("Stack trace: ", ex);
            return;
        }

        log.debug("\n\n===================\n\n");

        // Tracking variables
        Date start = harvester.now();
        int count = 0;

        // Run the harvest in batches of BATCH_SIZE rows
        try {
            for (int i = 0; i < 500; i++) {
                int read = harvester.loop(BATCH_SIZE, false);
                count += read;
                log.info("{}: Rows read: {}", harvester.time(), count);

                // Commit after each batch
                try {
                    harvester.commit();
                } catch (Exception ex) {
                    log.info("Commit failed: {}", harvester.time());
                    log.error("Stack trace: ", ex);
                }

                // Did we finish?
                if (read != BATCH_SIZE) {
                    break;
                }
            }
        } catch (Exception ex) {
            log.error("ERROR: An error occurred in the processing loop: ", ex);
        }

        // Reporting
        Date finish = harvester.now();
        float duration = (float) (finish.getTime() - start.getTime())
                / (float) 1000;
        log.info("\n\nTotal time for execution: {}", duration);
        log.info("Total records processed: {}", count);
        if (count == 0) {
            log.info("Average records per second: 0");
        } else {
            float speed = (float) count / duration;
            log.info("Average records per second: {}", speed);
        }

        // Final commit and optimize
        try {
            harvester.commit();
            log.info("\n{}: Index optimize...", harvester.time());
            harvester.optimize();
            log.info("{}: ... completed", harvester.time());
        } catch (Exception ex) {
            log.info("{}: ... failed", harvester.time());
            log.error("Stack trace: ", ex);
        }
        log.info("\n\n===================\n\n");

        harvester.shutdown();
    }

    private static void usage() {
        StringBuffer msg = new StringBuffer();
        msg.append("GeoNames Solr harvester. Usage:\n");
        msg.append("  harvest.sh [--withAlternateNames] [--countryIdsToBoost=AU,FR] geonames-dump.txt\n");
        msg.append("\n");
        msg.append("  geonames-dump.txt: input file to ingest (mandatory). This input file is\n");
        msg.append("      expected to be a tab-delimited geonames data dump\n");
        msg.append("  --withAlternateNames: load the alternate names field\n");
        msg.append("  --countryIdsToBoost: a comma separated list of country identifiers to boost\n");
        log.info(msg.toString());
    }

    private static void evalParam(String arg) {
        if (arg.startsWith("--withAlternateNames")) {
            withAlternateNames = true;
        } else if (arg.startsWith("--countryIdsToBoost")) {
            // Split only the value after '=', not the whole flag
            countryIdsToBoost = Arrays.asList(
                    arg.substring(arg.indexOf('=') + 1).split(","));
        } else {
            geonamesFileName = arg;
        }
    }
}
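For readers who want to launch the harvester from their own code rather than the harvest.sh wrapper mentioned in usage(), here is a minimal sketch. It is not part of the original project: the Solr home path and dump file name are illustrative assumptions, and the class simply delegates to Harvester.main(), which already handles flag parsing, batching, commits, and the final optimize.

    import com.googlecode.solrgeonames.harvester.Harvester;

    public class HarvestLauncher {
        public static void main(String[] args) {
            // The Harvester constructor reads this property to locate its Solr
            // home (solrconfig.xml and schema.xml). The path is an assumption.
            System.setProperty("geonames.solr.home", "/opt/geonames/solr");

            // Delegate to the harvester's own entry point: it parses the flags,
            // batches the rows, commits after each batch, and optimizes the index.
            Harvester.main(new String[] {
                    "--withAlternateNames",
                    "--countryIdsToBoost=AU,FR",
                    "allCountries.txt"   // tab-delimited GeoNames dump (assumed name)
            });
        }
    }

This mirrors the command-line form shown by usage(): harvest.sh [--withAlternateNames] [--countryIdsToBoost=AU,FR] geonames-dump.txt, with the geonames.solr.home system property set beforehand.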