at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java Source code

Java tutorial

Introduction

Here is the source code for at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java

Source

/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.fotoretrieval.retrievalengines;

import at.lux.components.StatusBar;
import at.lux.fotoretrieval.FileOperations;
import at.lux.fotoretrieval.ResultListEntry;
import at.lux.fotoretrieval.RetrievalToolkit;
import at.lux.fotoretrieval.lucene.Graph;
import at.lux.fotoretrieval.lucene.Node;
import at.lux.fotoretrieval.lucene.Relation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

import javax.swing.*;
import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Date: 13.10.2004
 * Time: 21:47:58
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class LuceneRetrievalEngine extends AbstractRetrievalEngine {
    private int maxResults = 40;
    private static Namespace xsi = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    public static final HashMap<String, String> relationMapping;

    public LuceneRetrievalEngine(int maxResults) {
        this.maxResults = maxResults;
    }

    static {
        relationMapping = new HashMap<String, String>(27);
        relationMapping.put("key", "keyFor");
        relationMapping.put("annotates", "annotatedBy");
        relationMapping.put("shows", "appearsIn");
        relationMapping.put("references", "referencedBy");
        relationMapping.put("quality", "qualityOf");
        relationMapping.put("symbolizes", "symbolizedBy");
        relationMapping.put("location", "locationOf");
        relationMapping.put("source", "sourceOf");
        relationMapping.put("destination", "destinationOf");
        relationMapping.put("path", "pathOf");
        relationMapping.put("time", "timeOf");
        relationMapping.put("depicts", "depictedBy");
        relationMapping.put("represents", "representedBy");
        relationMapping.put("context", "contextFor");
        relationMapping.put("interprets", "interpretedBy");
        relationMapping.put("agent", "agentOf");
        relationMapping.put("patient", "patientOf");
        relationMapping.put("experiencer", "experiencerOf");
        relationMapping.put("stimulus", "stimulusOf");
        relationMapping.put("causer", "causerOf");
        relationMapping.put("goal", "goalOf");
        relationMapping.put("beneficiary", "beneficiaryOf");
        relationMapping.put("theme", "themeOf");
        relationMapping.put("result", "resultOf");
        relationMapping.put("instrument", "instrumentOf");
        relationMapping.put("accompanier", "accompanierOf");
        relationMapping.put("summarizes", "summarizedBt");
        relationMapping.put("specializes", "generalizes");
        relationMapping.put("exemplifies", "exemplifiedBy");
        relationMapping.put("part", "partOf");
        relationMapping.put("property", "propertyOf");
        relationMapping.put("user", "userOf");
        relationMapping.put("component", "componentOf");
        relationMapping.put("substance", "substanceOf");
        relationMapping.put("entails", "entailedBy");
        relationMapping.put("manner", "mannerOf");
        relationMapping.put("state", "stateOf");
        relationMapping.put("influences", "dependsOn");
    }

    /**
     * In this case we can search for images with a String query deinfing
     * a graph, where the nodes are build by search queries in square brackets
     * and are referenced in relations by their postion starting with number 1.
     * relations follow the nodes starting with the type followed by the
     * position of the source node in the List and the target node:
     * <code>
     * query := node+ relation+
     * node := [term+]
     * relation := type source target
     * term := String
     * type := String
     * source := Integer
     * target := Integer
     * </code>
     * e.g."[Mathias Lux] [Talking] agentOf 1 2"
     *
     * @param xPath
     * @param objects       can be set to <code>null</code>
     * @param whereToSearch
     * @param recursive
     * @param progress
     */
    public List<ResultListEntry> getImagesBySemantics(String xPath, Vector objects, String whereToSearch,
            boolean recursive, JProgressBar progress) {
        List<String> nodeQueries = new LinkedList<String>();
        StringTokenizer st = new StringTokenizer(xPath, "]");
        String relationString = "";
        List<Relation> relations = new LinkedList<Relation>();
        while (st.hasMoreTokens()) {
            String s = st.nextToken().trim();
            if (s.startsWith("[")) {
                s = s.substring(1);
                nodeQueries.add(s);
            } else {
                relationString = s;
            }
        }
        if (relationString.length() > 1) {
            // there are relations, go ahead and parse them:
            StringTokenizer sr = new StringTokenizer(relationString);
            Relation currentRelation = null;
            while (sr.hasMoreTokens()) {
                String s = sr.nextToken();
                try {
                    int i = Integer.parseInt(s);
                    if (currentRelation.getSource() < 0) {
                        currentRelation.setSource(i);
                    } else if (currentRelation.getTarget() < 0) {
                        currentRelation.setTarget(i);
                        currentRelation.eliminateInverse();
                        relations.add(currentRelation);
                        currentRelation = null;
                    }
                } catch (NumberFormatException e) {
                    // its the type :)
                    currentRelation = new Relation(-1, -1, s.trim());
                }
            }
        }

        // so for now do the retrieval for the nodes:
        int numOfNodes = nodeQueries.size();
        List<List<Node>> nodeResults = new LinkedList<List<Node>>();

        for (int i = 0; i < numOfNodes; i++) {
            String queryString = nodeQueries.get(i);
            List<Node> nodes = getNodes(queryString, whereToSearch);
            nodeResults.add(nodes);
        }

        // now we can expand our query on retrieved nodes:
        List<Graph> graphList = getExpandedGraphsFromResults(nodeResults, relations, 3);
        LinkedList<ResultListEntry> results = new LinkedList<ResultListEntry>();
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(graphList.size());
            progress.setValue(0);
            progress.setString("Querying expanded graphs");
        }
        int countGraph = 0;
        for (Iterator<Graph> iterator = graphList.iterator(); iterator.hasNext();) {
            Graph graph = iterator.next();
            results.addAll(searchForGraph(graph, whereToSearch));
            countGraph++;
            if (progress != null)
                progress.setValue(countGraph);
        }

        // for now eliminate the doublettes
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(results.size());
            progress.setValue(0);
            progress.setString("Removing double entries");
        }
        countGraph = 0;
        HashMap<String, ResultListEntry> gegencheck = new HashMap<String, ResultListEntry>();
        for (Iterator<ResultListEntry> iterator = results.iterator(); iterator.hasNext();) {
            ResultListEntry resultListEntry = iterator.next();
            String file = resultListEntry.getFilePath();
            double relevance = resultListEntry.getRelevance();
            if (gegencheck.containsKey(file)) {
                double rel = gegencheck.get(file).getRelevance();
                if (rel < relevance) {
                    gegencheck.put(file, resultListEntry);
                }
            } else {
                gegencheck.put(file, resultListEntry);
            }
            countGraph++;
            if (progress != null)
                progress.setValue(countGraph);
        }
        results.clear();
        results.addAll(gegencheck.values());
        Collections.sort(results);
        return results;
    }

    private List<Graph> getExpandedGraphsFromResults(List<List<Node>> nodeResults, List<Relation> relations,
            int depth) {
        List<List<Node>> expanded = getExpandedSets(nodeResults, depth);
        //        System.out.println("Expanding to " + expanded.size() + " graphs");
        List<Graph> results = new LinkedList<Graph>();
        for (Iterator<List<Node>> iterator = expanded.iterator(); iterator.hasNext();) {
            List<Node> nodes = iterator.next();
            Graph g = getGraphFromResults(nodes, relations);
            results.add(g);
        }
        // if there are any relations without type we have to
        // create a reverse relation each, otherwise we won't
        // get all our results:
        //        List<Graph> additionalResults = new LinkedList<Graph>();
        //        for (Iterator<Graph> iterator = results.iterator(); iterator.hasNext();) {
        //            Graph graph = iterator.next();
        //            expandUntypedRelations(graph, additionalResults);
        //        }
        return results;
    }

    private List<List<Node>> getExpandedSets(List<List<Node>> nodeResults, int depth) {
        if (nodeResults.size() > 1) {
            List<Node> firstNodesResults = nodeResults.get(0);
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f)
                    break;
                numLevels++;
            }
            numLevels += depth;
            if (firstNodesResults.size() < depth) {
                numLevels = firstNodesResults.size();
            }
            List<List<Node>> tmpNodeResults = new LinkedList<List<Node>>(nodeResults);
            tmpNodeResults.remove(0);
            List<List<Node>> results = getExpandedSets(tmpNodeResults, depth);
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                for (int j = 0; j < results.size(); j++) {
                    List<Node> nodeList = new LinkedList<Node>(results.get(j));
                    nodeList.add(0, firstNodesResults.get(i));
                    endResult.add(nodeList);
                }
            }
            return endResult;
        } else {
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            List<Node> firstNodesResults = nodeResults.get(0);
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f)
                    break;
                numLevels++;
            }
            numLevels += depth;
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                List<Node> nodeList = new LinkedList<Node>();
                nodeList.add(firstNodesResults.get(i));
                endResult.add(nodeList);
            }
            return endResult;
        }
    }

    private Graph getGraphFromResults(List<Node> nodeResults, List<Relation> relations) {
        HashMap<Integer, Integer> idReplacementTable = new HashMap<Integer, Integer>(nodeResults.size());
        List<Node> nodes = new LinkedList<Node>();
        List<Relation> myRelations = new LinkedList<Relation>();
        for (int i = 0; i < nodeResults.size(); i++) {
            Node node = nodeResults.get(i);
            idReplacementTable.put(i + 1, node.getNodeID());
            nodes.add(node);
        }
        // Create the relations with the real IDs:
        for (Iterator<Relation> iterator = relations.iterator(); iterator.hasNext();) {
            Relation r = iterator.next();
            int src = (idReplacementTable.get(r.getSource()));
            int tgt = (idReplacementTable.get(r.getTarget()));
            myRelations.add(new Relation(src, tgt, r.getType()));
        }
        // now we can create the graph we want to search for:
        Graph g = new Graph(nodes, myRelations);
        return g;
    }

    private List<ResultListEntry> searchForGraph(Graph g, String whereToSearch) {
        //        System.out.println("Querying for graph: " + g.toString());
        //        for (Iterator<Node> iterator = g.getNodes().iterator(); iterator.hasNext();) {
        //            Node node = iterator.next();
        //            System.out.println(node.getLabel() + ": " + node.getNodeID() + " (" + node.getWeight() + ") ");
        //        }
        // and we search for it ion the text file:
        String indexFile;

        // create regex string:
        // as there are all nodes and relations surrounded with square brackets this is easy
        // between the relations there may various literals: '.*'
        String regexInsert = ".*";
        StringBuilder graphSearch = new StringBuilder(g.toString().length() * 2);
        StringTokenizer stok = new StringTokenizer(g.toString(), "[");
        String s = "";
        graphSearch.append(regexInsert);
        LinkedList<String> graphSearchList = new LinkedList<String>();
        while (stok.hasMoreTokens()) {
            StringBuilder regexItem = new StringBuilder(32);
            s = stok.nextToken().trim();
            s = s.substring(0, s.length() - 1);

            // and there may be other nodes & relations:
            regexItem.append(regexInsert);
            // opening bracket '['
            regexItem.append("\\x5B");
            // the actual content (node or relation)
            regexItem.append(s);
            // closing bracket ']'
            regexItem.append("\\x5D");
            // and there may be other nodes & relations:
            regexItem.append(regexInsert);

            String regex = regexItem.toString();

            if (regex.indexOf("\\w*") > -1) {
                // here we create support for relation wildcards:
                regex = expandUntypedRelation(regex);
            }

            graphSearchList.add(regex);

            /*
                        // opening bracket '['
                        graphSearch.append("\\x5B");
                        // the actual content (node or relation)
                        graphSearch.append(s);
                        // closing bracket ']'
                        graphSearch.append("\\x5D");
                        // and there may be other nodes & relations:
                        graphSearch.append(regexInsert);
            */
        }

        List<ResultListEntry> resultList = new LinkedList<ResultListEntry>();
        SAXBuilder builder = new SAXBuilder();

        if (!whereToSearch.endsWith(File.separator)) {
            indexFile = whereToSearch + File.separator + "idx_graphs.list";
        } else {
            indexFile = whereToSearch + "idx_graphs.list";
        }
        try {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(new FileInputStream(indexFile))));
            String line = null;
            //            String regex = graphSearch.toString();
            //            String oldRegex = regex;
            //            if (regex.indexOf("\\w*") > -1) {
            //                 here we create support for relation wildcards:
            //                regex = expandUntypedRelation(regex);
            //
            //            }
            //            System.out.println("REGEX: " + regex);
            while ((line = br.readLine()) != null) {
                boolean match = true;
                for (Iterator<String> iterator = graphSearchList.iterator(); iterator.hasNext();) {
                    String regex = iterator.next();
                    if (!line.matches(regex)) {
                        match = false;
                        continue;
                    }
                }
                if (match) {
                    // we found a graph:
                    System.out.println("FOUND: " + line);
                    StringTokenizer st = new StringTokenizer(line, "|");
                    String graphString = st.nextToken();
                    Graph theGraph = new Graph(graphString);
                    float similarity = g.getMcsSimilarity(theGraph);
                    while (st.hasMoreTokens()) {
                        String fileName = st.nextToken();
                        Element e = builder.build(new FileInputStream(fileName)).getRootElement();
                        ResultListEntry entry = new ResultListEntry((double) similarity, e, fileName);
                        resultList.add(entry);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultList;
    }

    private String expandUntypedRelation(String regex) {
        String behind = regex.substring(regex.indexOf("\\w*") + 4);
        String before = regex.substring(0, regex.indexOf("\\w*") + 4);

        String firstNum = behind.substring(0, behind.indexOf(' '));
        String secondNum = behind.substring(behind.indexOf(' ') + 1, behind.indexOf('\\'));

        behind = behind.substring(behind.indexOf('\\'));

        regex = before + "((" + firstNum + " " + secondNum + ")|(" + secondNum + " " + firstNum + "))" + behind;
        return regex;
    }

    /**
     * Not implemented ... please use the method of the engine
     * {@link at.lux.fotoretrieval.retrievalengines.FileSystemRetrieval}
     * @param VisualDescriptor
     * @param whereToSearch
     * @param recursive
     * @param progress
     */
    public List<ResultListEntry> getSimilarImages(Element VisualDescriptor, String whereToSearch, boolean recursive,
            JProgressBar progress) {
        return null;
    }

    public List<ResultListEntry> getSimilarImages_fromSet(Set<Element> VisualDescriptorSet, String whereToSearch,
            boolean recursive, JProgressBar progress) {
        return null;
    }

    public List<ResultListEntry> getImagesByXPathSearch(String xPath, String whereToSearch, boolean recursive,
            JProgressBar progress) {
        ArrayList<ResultListEntry> results = new ArrayList<ResultListEntry>(maxResults);
        if (progress != null)
            progress.setString("Searching through index");
        SAXBuilder builder = new SAXBuilder();
        try {
            QueryParser qParse = new QueryParser("all", new StandardAnalyzer());
            IndexSearcher searcher = new IndexSearcher(parseFulltextIndexDirectory(whereToSearch));
            Query query = qParse.parse(xPath);
            Hits hits = searcher.search(query);
            int hitsCount = hits.length();
            if (hitsCount > maxResults)
                hitsCount = maxResults;
            if (progress != null) {
                progress.setMinimum(0);
                progress.setMaximum(hitsCount);
                progress.setValue(0);
                progress.setString("Reading results from disk");
            }

            for (int i = 0; i < hitsCount; i++) {
                Document d = hits.doc(i);
                Element e = builder.build(new FileInputStream(d.get("file"))).getRootElement();
                results.add(new ResultListEntry(hits.score(i), e, d.get("file")));
                if (progress != null)
                    progress.setValue(i);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("XPath was: " + xPath);
            e.printStackTrace();
        } catch (JDOMException e) {
            e.printStackTrace();
        }
        return results;

    }

    /**
     * In general we take the base path for our search for the pathToIndex parameter.
     * we then add the directory "index" and create it there.
     *
     * @param pathToIndex
     * @param statusBar
     */
    public void indexFiles(String pathToIndex, StatusBar statusBar) {
        // parsing and eventually creating the directory for the index ...
        String indexDir = parseFulltextIndexDirectory(pathToIndex);

        Analyzer analyzer = new StandardAnalyzer();
        boolean createFlag = true;
        SAXBuilder builder = new SAXBuilder();
        String prefix = "Creating fulltext index: ";
        try {
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null)
                return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);

            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    Document idxDocument = new Document();
                    // adding the file itself ...
                    idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                    // adding all given names
                    StringBuilder all = new StringBuilder(255);

                    List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    //                    System.out.println("NumberOfRelations: " + l.size());

                    addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                    addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                    addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                    addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);

                    idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

                    writer.addDocument(idxDocument);

                    if (statusBar != null) {
                        StringBuilder status = new StringBuilder(13).append(prefix);
                        status.append(df.format(((float) i) / numAllDocsPercent));
                        status.append('%');
                        statusBar.setStatus(status.toString());
                    }

                } catch (Exception e1) {
                    System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
                }
            }
            writer.optimize();
            writer.close();
            if (statusBar != null) {
                statusBar.setStatus("Indexing finished");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a path from the base directory to a index directory for storing
     * the fulltext index
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to index directory for with Lucene
     */
    public static String parseFulltextIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_fulltext";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists())
            indexDirFile.mkdir();
        return indexDir;
    }

    /**
     * Creates a path from the base directory to a index directory for storing
     * the index of semantic objects
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to index directory for with Lucene
     */
    public static String parseSemanticIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_semantic";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists())
            indexDirFile.mkdir();
        return indexDir;
    }

    private void addToDocument(Document document, Element root, String xPath, String fieldName,
            StringBuilder allContents) {
        List l = RetrievalToolkit.xpathQuery(root, xPath, null);
        StringWriter sw = new StringWriter(128);
        for (Iterator iterator = l.iterator(); iterator.hasNext();) {
            Element e = (Element) iterator.next();
            sw.append(e.getTextTrim());
            sw.append(" ");
            allContents.append(e.getTextTrim());
            allContents.append(" ");
        }
        document.add(new Field(fieldName, sw.toString(), Field.Store.YES, Field.Index.TOKENIZED));
    }

    public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
        if (statusBar != null)
            statusBar.setStatus("Creating index from semantic annotations");

        SAXBuilder builder = new SAXBuilder();
        XMLOutputter outputter = new XMLOutputter(
                Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

        try {
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null)
                return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);

            // Preparing objects for the index:
            HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
            HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                    descriptions.length);

            // in the first run we identify the semantic objects that we want to index and build
            // a table were we can relate them to the documents (identified by their path)
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement).trim()
                                .replaceAll("id=\"id_[0-9]*\"", "");
                        // check if element is already there, indicator is its string representation.
                        if (!elementMap.keySet().contains(xmlString)) {
                            // its not here, put it in.
                            elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                            //                            System.out.println(xmlString);
                        }
                        // now get the unified element
                        semanticElement = elementMap.get(xmlString).semanticElement;
                        // and check if there is an entry in the table for where to find the element
                        if (!element2document.keySet().contains(semanticElement)) {
                            element2document.put(semanticElement, new LinkedList<String>());
                        }
                        // and add found document if not already there:
                        List documentList = element2document.get(semanticElement);
                        if (!documentList.contains(descriptions[i]))
                            documentList.add(descriptions[i]);
                    }
                    if (statusBar != null)
                        statusBar.setStatus(
                                "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            // read stats:
            // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

            // Now we can add the nodes to a lucene index:
            // fields: label, id, type, files (separated by '|'), xml, all
            // -------------------------------------------

            // opening the index for writing:
            boolean createFlag = true;
            String indexDir = parseSemanticIndexDirectory(pathToIndex);
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

            if (statusBar != null)
                statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

            // iterating through nodes and storing them:
            for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
                Element semElement = iterator.next();
                // needed for later XPath :( otherwise everthing in the whole document is retrieved.

                String fileList = getFileListFromNode(element2document.get(semElement));
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));

                //                System.out.println(((Element) o).getTextTrim());

                StringBuilder all = new StringBuilder(255);
                // adding the label
                //                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
                String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                        .getChildTextTrim("Name", semElement.getNamespace());
                idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));

                // adding the type:
                String elementType = semElement.getAttribute("type", xsi).getValue().trim();
                idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
                // adding the XML contents:
                String xmlString = outputter.outputString(semElement);
                idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
                // adding the id:
                idxDocument.add(
                        new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                                Field.Store.YES, Field.Index.NO));
                // adding all, unstored for retrieval only
                List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
                for (Iterator it3 = l.iterator(); it3.hasNext();) {
                    Element e = (Element) it3.next();
                    all.append(e.getTextTrim());
                    all.append(" ");
                }
                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

                writer.addDocument(idxDocument);

            }
            // now optimize and close the index:
            // todo: open index for appending and/or updating
            writer.optimize();
            writer.close();

            // Now we can create the powerset for each existing graph
            // (based on sorted node ids) and store
            // all resulting graphs within an index.
            // ----------------------------------------------------------
            if (statusBar != null)
                statusBar.setStatus("Creating and merging powersets of available graphs");
            HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                    descriptions.length);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                    LinkedList<Relation> relations = new LinkedList<Relation>();
                    LinkedList<Integer> nodes = new LinkedList<Integer>();
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement);
                        int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                        String docID = semanticElement.getAttribute("id").getValue();
                        docID2overallID.put(docID, id);
                        nodes.add(id);
                    }
                    // get all relations with global ids and eliminate inverse relations
                    l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element relation = (Element) iterator.next();
                        int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                        int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                        String type = relation.getAttribute("type").getValue();
                        type = type.substring(type.lastIndexOf(':') + 1);
                        Relation r = eliminateInverse(new Relation(source, target, type));
                        relations.add(r);
                    }

                    // now create a graph object
                    Collections.sort(nodes);
                    Collections.sort(relations);
                    LinkedList<Node> nodeList = new LinkedList<Node>();
                    for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                        nodeList.add(new Node(iterator.next()));
                    }
                    Graph g = new Graph(nodeList, relations);
                    //                    List<Graph> powerSet = new LinkedList<Graph>();
                    //                    powerSet.add(g);
                    HashSet<String> docs = new HashSet<String>(1);
                    docs.add(descriptions[i]);
                    graph2document.put(g, docs);
                    /*
                        
                                        // add all these subgraphs and the reference to the document to
                                        // a data structure:
                                        for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                    Graph graph = iterator.next();
                    //                        List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                    //                        for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                    //                            Graph graph1 = iterator1.next();
                    //                        }
                    // add graph if not trivial:
                    if (graph.getNodes().size() > 1) {
                        // containsKey for Graph does not match my needs -
                        // different graph objects reference the same graph!
                        if (string2graph.containsKey(graph.toString())) {
                            graph = string2graph.get(graph.toString());
                            graph2document.get(graph).add(descriptions[i]);
                        } else {
                            HashSet<String> docs = new HashSet<String>(1);
                            docs.add(descriptions[i]);
                            graph2document.put(graph, docs);
                        }
                    }
                                        }
                    */
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                }
            }

            HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
            HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

            /*
            For now we reduce the number of graphs by identifiying and merging duplicates and
            remove redundant entries:
            */
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                if (str2graph.containsKey(g.toString())) {
                    g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
                } else {
                    str2graph.put(g.toString(), g);
                    g2d.put(g, graph2document.get(g));
                }
            }
            graph2document = g2d;
            System.out.println(graph2document.size() + " non trivial different graphs were found");
            // now put all the available graphs into an index:
            // -----------------------------------------------
            // todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie

            // for now we will store a simple text file:
            if (statusBar != null)
                statusBar.setStatus("Storing powersets of available graphs as file");
            String indexFile;
            if (!pathToIndex.endsWith(File.separator)) {
                indexFile = pathToIndex + File.separator + "idx_graphs.list";
            } else {
                indexFile = pathToIndex + "idx_graphs.list";
            }
            File f = new File(indexFile);
            BufferedWriter bw = new BufferedWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                bw.write(g.toString());
                for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                    String s = iterator1.next();
                    bw.write("|" + s);
                }
                bw.write("\n");
            }
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Searches for all available nodes with given query String
     *
     * @param queryString   query like "Mathias Lux" or some text inside a node.
     * @param whereToSearch defines the base directory for the search
     * @return a List of Matching nodes with their associated weights
     */
    public static List<Node> getNodes(String queryString, String whereToSearch) {
        return ((LucenePathIndexRetrievalEngine) RetrievalEngineFactory.getPathIndexRetrievalEngine())
                .getNodes(queryString, whereToSearch);
    }

    private String getFileListFromNode(List<String> list) {
        StringBuilder files = new StringBuilder(64);
        for (Iterator<String> it2 = list.iterator(); it2.hasNext();) {
            files.append(it2.next());
            if (it2.hasNext()) {
                files.append('|');
            }
        }
        return files.toString();
    }

    /**
     * Eliminates all inverse relations to simplify retrieval
     *
     * @param relation
     * @return the normalized relation
     */
    public static Relation eliminateInverse(Relation relation) {
        Relation result = relation;
        if (relationMapping.containsKey(relation.getType())) {
            result = new Relation(relation.getTarget(), relation.getSource(),
                    relationMapping.get(relation.getType()));
        }
        return result;
    }

}