Java tutorial: parallel image indexing for the LIRE Solr plugin

The class below is a modified version of LIRE's ParallelSolrIndexer. It walks a fashion-product dataset, extracts global image features (CEDD, FCTH, JCD, AutoColorCorrelogram) in parallel, writes Solr XML update documents per gender and category, and posts them to the corresponding Solr core.
/*
 * This file is part of the LIRE project: http://www.semanticmetadata.net/lire
 * LIRE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LIRE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LIRE; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * We kindly ask you to cite any one of the following publications in
 * any publication mentioning or employing Lire:
 *
 * Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval -
 * An Extensible Java CBIR Library. In proceedings of the 16th ACM International
 * Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
 * URL: http://doi.acm.org/10.1145/1459359.1459577
 *
 * Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
 * 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
 * Arizona, USA, 2011
 * URL: http://dl.acm.org/citation.cfm?id=2072432
 *
 * Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE.
 * Morgan & Claypool, 2013
 * URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
 * http://www.semanticmetadata.net/lire, http://www.lire-project.net
 */

package net.semanticmetadata.lire.solr.indexing;

import net.semanticmetadata.lire.imageanalysis.features.GlobalFeature;
import net.semanticmetadata.lire.imageanalysis.features.global.*;
import net.semanticmetadata.lire.indexers.hashing.BitSampling;
import net.semanticmetadata.lire.indexers.hashing.MetricSpaces;
import net.semanticmetadata.lire.solr.FeatureRegistry;
import net.semanticmetadata.lire.solr.HashingMetricSpacesManager;
import net.semanticmetadata.lire.utils.ImageUtils;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DirectXmlRequest;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import javax.imageio.ImageIO;
import javax.json.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
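/*
 * Class-path assumption (artifact coordinates illustrative): besides lire and lire-solr, the
 * javax.json imports above need a JSON-P implementation such as org.glassfish:javax.json, and
 * the SolrJ imports need org.apache.solr:solr-solrj.
 */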
/**
 * This indexing application allows for parallel extraction of global features from multiple image
 * files for use with the LIRE Solr plugin. It basically takes a list of images (e.g. created by
 * something like "dir /s /b > list.txt" or "ls [some parameters] > list.txt").
 *
 * Use it like:
 * <pre>$> java -jar lire-request-handler.jar -i <infile> [-o <outfile>] [-n <threads>] [-m <max_side_length>] [-f]</pre>
 *
 * Available options are:
 * <ul>
 * <li>-i <infile> ... gives a file with a list of images to be indexed, one per line.</li>
 * <li>-o <outfile> ... gives the XML file the output is written to. If none is given, the outfile is <infile>.xml.</li>
 * <li>-n <threads> ... gives the number of threads used for extraction. The number of cores is a good value for that.</li>
 * <li>-m <max-side-length> ... gives a maximum side length for extraction. This option is useful if very large images are indexed.</li>
 * <li>-f ... forces overwriting of <outfile>. If <outfile> already exists and -f is not given, the operation is aborted.</li>
 * <li>-p ... enables image processing before indexing (despeckle, trim white space).</li>
 * <li>-a ... uses both BitSampling and MetricSpaces.</li>
 * <li>-l ... disables BitSampling and uses MetricSpaces instead.</li>
 * <li>-r ... defines a class implementing net.semanticmetadata.lire.solr.indexing.ImageDataProcessor that provides additional fields.</li>
 * </ul>
 * <p>
 * Note: this modified version does not parse the options above; its main() takes four positional
 * arguments instead (dataset path, gender, category, shop).
 * </p>
 * <p>
 * TODO: Make the feature list changeable.
 * </p>
 * You then basically need to enrich the file with whatever metadata you prefer and send it to
 * Solr, for instance using curl:
 * <pre>curl http://localhost:9000/solr/lire/update -H "Content-Type: text/xml" --data-binary @extracted_file.xml
 * curl http://localhost:9000/solr/lire/update -H "Content-Type: text/xml" --data-binary "<commit/>"</pre>
 *
 * @author Mathias Lux, mathias@juggle.at on 13.08.2013
 */
public class ParallelSolrIndexer {
    private final static int maxCacheSize = 250;
    private final static int numberOfThreads = 8;
    private final static int monitoringInterval = 10;
    private final static int maxSideLength = 512;
    private final static boolean useMetricSpaces = false;
    private final static boolean useBitSampling = true;

    private LinkedBlockingQueue<Product> products = new LinkedBlockingQueue<>(maxCacheSize);
    private boolean ended = false;
    private int overallCount = 0;
    private OutputStream dos;
    private Set<Class> featuresSet;
    private File outfile;
    private String datasetPath;
    private List<String> gendersList;
    private List<String> categoriesList;
    private List<String> shopsList;
    private String gender, category;
    private List<String> previousProductsList;
    private List<Product> newProductsList;

    public ParallelSolrIndexer() {
        featuresSet = new HashSet<>();
        featuresSet.add(CEDD.class);
        featuresSet.add(FCTH.class);
        featuresSet.add(JCD.class);
        featuresSet.add(AutoColorCorrelogram.class);
        HashingMetricSpacesManager.init(); // load reference points from disk.
        previousProductsList = new ArrayList<>();
        newProductsList = new ArrayList<>();
    }

    public static void main(String[] args) throws IOException {
        BitSampling.readHashFunctions();
        ParallelSolrIndexer indexer = new ParallelSolrIndexer();
        if (args.length < 4) {
            System.err.println("Wrong number of arguments, 4 needed. Usage example:\n"
                    + "/home/path/to/dataset mujer all all");
            System.exit(1);
        }
        indexer.setDatasetPath(args[0]);
        indexer.configureGendersList(args[1]);
        if (indexer.gendersList.isEmpty()) {
            System.err.println("Invalid gender argument supplied");
            System.exit(1);
        }
        indexer.configureCategoriesList(args[2]);
        if (indexer.categoriesList.isEmpty()) {
            System.err.println("Invalid category argument supplied");
            System.exit(1);
        }
        indexer.configureShopsList(args[3]);
        if (indexer.shopsList.isEmpty()) {
            System.err.println("Invalid shop argument supplied");
            System.exit(1);
        }
        for (String gender : indexer.gendersList) {
            indexer.gender = gender;
            for (String category : indexer.categoriesList) {
                System.out.println("Starting with: " + category);
                indexer.category = category;
                String categoryFolderPath = indexer.datasetPath + "/" + gender + "/" + category;
                indexer.initOutfile(categoryFolderPath, category);
                // Reset the per-category work lists; the ids accumulate over the shops below.
                indexer.previousProductsList.clear();
                indexer.newProductsList.clear();
                for (String shop : indexer.shopsList) {
                    indexer.configurePreviousProductsList(shop);
                    indexer.configureNewProductsList(shop);
                }
                // Wrap all commands in a single <update> root element so the file stays
                // well-formed XML even when it contains both a <delete> and an <add> block;
                // Solr's XML update format accepts multiple commands inside one <update>.
                indexer.dos.write("<update>\n".getBytes());
                if (!indexer.previousProductsList.isEmpty()) {
                    indexer.dos.write("<delete>\n".getBytes());
                    indexer.writeIdsToDelete();
                    indexer.dos.write("</delete>\n".getBytes());
                }
                if (!indexer.newProductsList.isEmpty()) {
                    indexer.dos.write("<add>\n".getBytes());
                    indexer.writeDocumentsToAdd();
                    indexer.dos.write("</add>\n".getBytes());
                }
                indexer.dos.write("</update>\n".getBytes());
                indexer.closeOutfile();
                if (!indexer.previousProductsList.isEmpty() || !indexer.newProductsList.isEmpty()) {
                    indexer.postIndexToServer();
                }
            }
        }
    }
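    /*
     * A sample invocation (paths hypothetical, inferred from the argument handling in main):
     *
     *   java net.semanticmetadata.lire.solr.indexing.ParallelSolrIndexer \
     *       /home/user/dataset mujer vestidos zara
     *
     * walks /home/user/dataset/mujer/vestidos, writes outfile_vestidos.xml into that folder and
     * posts it to the Solr core "mujer_vestidos". Passing "all" for gender, category or shop
     * expands to every known value of that dimension.
     */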
    private void setDatasetPath(String datasetPath) {
        this.datasetPath = datasetPath;
    }

    private void configureGendersList(String genderArgument) {
        String[] allGenders = {"hombre", "mujer"};
        gendersList = new ArrayList<>();
        if (Arrays.asList(allGenders).contains(genderArgument)) {
            gendersList.add(genderArgument);
        } else if (genderArgument.equals("all")) {
            gendersList.addAll(Arrays.asList(allGenders));
        }
    }

    private void configureCategoriesList(String categoryArgument) {
        String[] allCategories = {"abrigos_chaquetas", "camisas_blusas", "camisetas", "faldas",
                "monos", "pantalones_cortos", "pantalones_largos", "punto", "sudaderas_jerseis",
                "tops_bodies", "vestidos"};
        categoriesList = new ArrayList<>();
        if (Arrays.asList(allCategories).contains(categoryArgument)) {
            categoriesList.add(categoryArgument);
        } else if (categoryArgument.equals("all")) {
            categoriesList.addAll(Arrays.asList(allCategories));
        }
    }

    private void configureShopsList(String shopArgument) {
        String[] allShops = {"asos", "laredoute", "mango", "pullandbear", "zara"};
        shopsList = new ArrayList<>();
        if (Arrays.asList(allShops).contains(shopArgument)) {
            shopsList.add(shopArgument);
        } else if (shopArgument.equals("all")) {
            shopsList.addAll(Arrays.asList(allShops));
        }
    }

    private void configurePreviousProductsList(String shop) throws FileNotFoundException {
        String previousProductsFilePath = datasetPath + "/" + gender + "/" + category + "/" + shop
                + "/products" + "/previous_products.json";
        previousProductsList.addAll(extractListFromJsonFile(previousProductsFilePath));
    }

    private void configureNewProductsList(String shop) throws FileNotFoundException {
        String newProductsFilePath = datasetPath + "/" + gender + "/" + category + "/" + shop
                + "/products" + "/new_products.json";
        List<String> newProductsIdsList = extractListFromJsonFile(newProductsFilePath);
        for (String productId : newProductsIdsList) {
            Product newProduct = new Product(productId, gender, category, shop);
            newProductsList.add(newProduct);
        }
    }
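    /*
     * Expected on-disk layout, inferred from the path construction above (the concrete file
     * contents are an assumption): each shop folder provides
     *
     *   <dataset>/<gender>/<category>/<shop>/products/previous_products.json
     *   <dataset>/<gender>/<category>/<shop>/products/new_products.json
     *
     * where each file is a flat JSON array of product id strings, e.g. ["12345", "12346"].
     */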
    private static List<String> extractListFromJsonFile(String filePath) throws FileNotFoundException {
        JsonReader newProductsJsonReader = Json.createReader(new FileReader(filePath));
        JsonArray jsonArray = newProductsJsonReader.readArray();
        newProductsJsonReader.close();
        return jsonArray.getValuesAs(JsonString::getString);
    }

    private void initOutfile(String categoryFolderPath, String category) throws IOException {
        outfile = new File(categoryFolderPath + "/outfile_" + category + ".xml");
        dos = new BufferedOutputStream(new FileOutputStream(outfile), 1024 * 1024 * 8);
    }

    private void closeOutfile() throws IOException {
        dos.close();
    }

    private void writeIdsToDelete() throws IOException {
        for (String productId : previousProductsList) {
            dos.write(("<id>" + productId + "</id>\n").getBytes());
        }
    }

    private void postIndexToServer() {
        try {
            SolrClient client = new HttpSolrClient.Builder(
                    "http://54.93.254.52:8983/solr/" + gender + "_" + category).build();
            //SolrClient client = new HttpSolrClient.Builder("http://54.93.254.52:8983/solr/prueba").build();
            // Re-parse the generated file, strip the line breaks and send it as a direct XML
            // update request, followed by a commit.
            DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
            Document doc = docBuilder.parse(outfile);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer = tf.newTransformer();
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            StringWriter writer = new StringWriter();
            transformer.transform(new DOMSource(doc), new StreamResult(writer));
            String xmlOutput = writer.getBuffer().toString().replaceAll("[\n\r]", "");
            DirectXmlRequest xmlreq = new DirectXmlRequest("/update", xmlOutput);
            client.request(xmlreq);
            client.commit();
        } catch (SolrServerException | IOException | TransformerException | SAXException
                | ParserConfigurationException e) {
            e.printStackTrace();
        }
    }

    // Converts an int array of hashes into a space-separated string of hex values,
    // e.g. {255, 16} becomes "ff 10".
    public static String arrayToString(int[] array) {
        StringBuilder sb = new StringBuilder(array.length * 8);
        for (int i = 0; i < array.length; i++) {
            if (i > 0) sb.append(' ');
            sb.append(Integer.toHexString(array[i]));
        }
        return sb.toString();
    }

    private void writeDocumentsToAdd() {
        System.out.println("Extracting features: ");
        for (Class feature : featuresSet) {
            System.out.println("\t" + feature.getCanonicalName());
        }
        try {
            ended = false;
            overallCount = 0;
            Thread p = new Thread(new Producer(), "Producer");
            p.start();
            LinkedList<Thread> threads = new LinkedList<>();
            long l = System.currentTimeMillis();
            for (int i = 0; i < numberOfThreads; i++) {
                Thread c = new Thread(new Consumer(), "Consumer-" + i);
                c.start();
                threads.add(c);
            }
            Thread m = new Thread(new Monitoring(), "Monitoring");
            m.start();
            for (Thread thread : threads) {
                thread.join();
            }
            p.join();
            m.join();
            long l1 = System.currentTimeMillis() - l;
            System.out.println("Analyzed " + overallCount + " images in " + l1 / 1000 + " seconds, ~"
                    + (overallCount > 0 ? (l1 / overallCount) : "inf.") + " ms each.");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
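    /*
     * Sketch of one generated document (values abbreviated and hypothetical; the actual field
     * names are resolved through FeatureRegistry, e.g. "ce_hi" for the CEDD histogram and
     * "ce_ha" for its BitSampling hashes):
     *
     *   <doc>
     *     <field name="id">12345</field>
     *     <field name="shop">zara</field>
     *     <field name="price">19.99</field>
     *     <field name="ce_hi">...Base64-encoded histogram...</field>
     *     <field name="ce_ha">ff 10 3a ...</field>
     *   </doc>
     */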
    class Product {
        String productId;
        String gender;
        String category;
        String shop;
        String price;
        byte[] buffer;

        // Empty products (productId == null) serve as poison pills for the consumers.
        Product() {
            productId = null;
        }

        Product(String productId, String gender, String category, String shop) throws FileNotFoundException {
            this.productId = productId;
            this.gender = gender;
            this.category = category;
            this.shop = shop;
            generateInfoFromJson();
        }

        String getCroppedImagePath() {
            return datasetPath + "/" + this.gender + "/" + this.category + "/CROPPED/"
                    + productId + "_CROPPED.png";
        }

        void generateInfoFromJson() throws FileNotFoundException {
            String jsonFilePath = datasetPath + "/" + this.gender + "/" + this.category + "/" + shop
                    + "/products/" + productId + "/" + productId + ".json";
            JsonReader jsonReader = Json.createReader(new FileReader(jsonFilePath));
            JsonObject jsonObject = jsonReader.readObject();
            jsonReader.close();
            price = jsonObject.getString("price");
        }

        void setBuffer(byte[] buffer) {
            this.buffer = buffer;
        }
    }

    class Monitoring implements Runnable {
        public void run() {
            long ms = System.currentTimeMillis();
            try {
                Thread.sleep(1000 * monitoringInterval);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            while (!ended) {
                try {
                    long time = System.currentTimeMillis() - ms;
                    System.out.println("Analyzed " + overallCount + " images in " + time / 1000
                            + " seconds, " + ((overallCount > 0) ? (time / overallCount) : "n.a.")
                            + " ms each (" + products.size() + " images currently in queue).");
                    Thread.sleep(1000 * monitoringInterval); // wait monitoringInterval seconds
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    class Producer implements Runnable {
        public void run() {
            for (Product next : newProductsList) {
                try {
                    File croppedImage = new File(next.getCroppedImagePath());
                    // Read the file from the hard drive into a byte buffer via java.nio, so that
                    // decoding moves to the consumers and the load on the drive stays low.
                    int fileSize = (int) croppedImage.length();
                    byte[] buffer = new byte[fileSize];
                    try (FileInputStream fis = new FileInputStream(croppedImage);
                         FileChannel channel = fis.getChannel()) {
                        MappedByteBuffer map = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize);
                        map.load();
                        map.get(buffer);
                    }
                    Product product = new Product(next.productId, next.gender, next.category, next.shop);
                    product.setBuffer(buffer);
                    products.put(product);
                } catch (Exception e) {
                    System.err.println("Could not read image " + next.getCroppedImagePath() + ": "
                            + e.getMessage());
                }
            }
            // One poison pill per consumer thread, so every consumer terminates.
            for (int i = 0; i < numberOfThreads; i++) {
                try {
                    products.put(new Product());
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
            ended = true;
        }
    }
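    /*
     * Hand-off between Producer and Consumer: the Producer fills the bounded queue (capacity
     * maxCacheSize) with raw file bytes, and the consumers decode the images, extract features
     * and write XML. Shutdown follows a poison-pill protocol: the Producer enqueues one empty
     * Product (productId == null) per consumer thread, and a consumer that takes such a pill
     * sets locallyEnded and leaves its loop.
     */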
    class Consumer implements Runnable {
        Product tmp = null;
        LinkedList<GlobalFeature> features = new LinkedList<>();
        int count = 0;
        boolean locallyEnded = false;
        StringBuilder sb = new StringBuilder(1024);

        Consumer() {
            addFeatures();
        }

        // Each consumer uses its own feature instances; the extractors keep state during extract().
        private void addFeatures() {
            for (Class next : featuresSet) {
                try {
                    features.add((GlobalFeature) next.newInstance());
                } catch (InstantiationException | IllegalAccessException e) {
                    e.printStackTrace();
                }
            }
        }

        public void run() {
            while (!locallyEnded) {
                try {
                    if (!locallyEnded) {
                        tmp = products.take();
                        if (tmp.productId == null) locallyEnded = true; // poison pill
                        else {
                            count++;
                            overallCount++; // unsynchronized; only used for progress reporting
                        }
                    }
                    if (!locallyEnded) {
                        sb.delete(0, sb.length());
                        ByteArrayInputStream b = new ByteArrayInputStream(tmp.buffer);
                        // Reads the image. Make sure the TwelveMonkeys library is on the class
                        // path to read all JPEG and TIFF variants.
                        BufferedImage read = ImageIO.read(b);
                        // Converts the color space to INT_RGB.
                        BufferedImage img = ImageUtils.createWorkingCopy(read);
                        img = ImageUtils.trimWhiteSpace(img); // trims white space
                        if (maxSideLength > 50)
                            img = ImageUtils.scaleImage(img, maxSideLength); // scales to maxSideLength at most
                        else if (img.getWidth() < 32 || img.getHeight() < 32) {
                            // Image is too small to be worked with; for now it is simply upscaled
                            // so that its longer side measures 128 pixels.
                            double scaleFactor = 128d;
                            if (img.getWidth() > img.getHeight()) {
                                scaleFactor = (128d / (double) img.getWidth());
                            } else {
                                scaleFactor = (128d / (double) img.getHeight());
                            }
                            img = ImageUtils.scaleImage(img, (int) (scaleFactor * img.getWidth()),
                                    (int) (scaleFactor * img.getHeight()));
                        }
                        // --------< creating doc >-------------------------
                        String imageId = tmp.productId;
                        sb.append("<doc>");
                        sb.append("<field name=\"id\">");
                        sb.append(imageId);
                        sb.append("</field>");
                        sb.append("<field name=\"shop\">");
                        sb.append(tmp.shop);
                        sb.append("</field>");
                        sb.append("<field name=\"price\">");
                        sb.append(tmp.price);
                        sb.append("</field>");
                        for (GlobalFeature feature : features) {
                            String featureCode = FeatureRegistry.getCodeForClass(feature.getClass());
                            if (featureCode != null) {
                                feature.extract(img);
                                String histogramField = FeatureRegistry.codeToFeatureField(featureCode);
                                String hashesField = FeatureRegistry.codeToHashField(featureCode);
                                String metricSpacesField = FeatureRegistry.codeToMetricSpacesField(featureCode);
                                // The raw histogram goes into the index Base64-encoded ...
                                sb.append("<field name=\"").append(histogramField).append("\">");
                                sb.append(Base64.getEncoder().encodeToString(feature.getByteArrayRepresentation()));
                                sb.append("</field>");
                                // ... plus, optionally, BitSampling hashes and/or MetricSpaces strings.
                                if (useBitSampling) {
                                    sb.append("<field name=\"").append(hashesField).append("\">");
                                    sb.append(arrayToString(BitSampling.generateHashes(feature.getFeatureVector())));
                                    sb.append("</field>");
                                }
                                if (useMetricSpaces && MetricSpaces.supportsFeature(feature)) {
                                    sb.append("<field name=\"").append(metricSpacesField).append("\">");
                                    sb.append(MetricSpaces.generateHashString(feature));
                                    sb.append("</field>");
                                }
                            }
                        }
                        sb.append("</doc>\n");
                        // --------< / creating doc >-------------------------
                        // Finally write everything to the stream, in case no exception was thrown.
                        synchronized (dos) {
                            dos.write(sb.toString().getBytes());
                        }
                    }
                } catch (Exception e) {
                    System.err.println("Error processing file " + tmp.getCroppedImagePath());
                    e.printStackTrace();
                }
            }
        }
    }
}
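/*
 * Once the XML has been posted and committed, visual search goes through the lire-solr request
 * handler. A sketch, assuming it is registered under /lireq as in the lire-solr documentation
 * (host, core and parameters are illustrative):
 *
 *   curl "http://localhost:8983/solr/mujer_vestidos/lireq?field=ce_ha&url=http://example.com/query.png"
 */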