Java tutorial

SurfaceFormIndexUpdater (from Linkipedia, Tetherless World Constellation) reads RDF triple files from a source directory and builds or updates a Lucene surface-form index: for each subject it collects the values of name/label properties, tags each value with a per-property weight, and stores the entity with an optional per-entity boost read from a weight file.
/**
 * Linkipedia, Copyright (c) 2015 Tetherless World Constellation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package edu.rpi.tw.linkipedia.search.indexing;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import edu.rpi.tw.linkipedia.search.index.analyzer.EntropyAnalyzer;
import edu.rpi.tw.linkipedia.search.indexing.DefaultAnalyzer;
import edu.rpi.tw.linkipedia.search.similarity.MySimilarity;
import edu.rpi.tw.linkipedia.search.utils.Utils;

/**
 * Builds or updates the Lucene surface-form index from RDF triple files.
 * Name/label values are collected per subject, weighted by property, and
 * indexed together with an optional per-entity boost read from a weight file.
 */
public class SurfaceFormIndexUpdater {

    String sourceDirectory;
    String indexDirectory;
    String currentLine;
    BufferedReader weightReader;
    Hashtable<String, Float> propertyWeight;

    public SurfaceFormIndexUpdater(String source, String destination) {
        sourceDirectory = source;
        indexDirectory = destination;
        propertyWeight = new Hashtable<String, Float>();
    }

    /**
     * Configures the per-field analyzers and writes (or updates) the index
     * under indexDirectory from the files under sourceDirectory.
     */
    public void updateIndex() {
        try {
            Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
            PayloadEncoder encoder = new FloatEncoder();
            EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);

            // "label" carries a float payload (the property weight); the other
            // fields use the default analyzer.
            Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>();
            myAnalyzerMap.put("label", entropyAnalyzer);
            myAnalyzerMap.put("analyzedLabel", stdAnalyzer);
            PerFieldAnalyzerWrapper myAnalyzer =
                    new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap);

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, myAnalyzer);
            iwc.setSimilarity(new MySimilarity());
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

            Directory dir = FSDirectory.open(new File(indexDirectory));
            IndexWriter writer = new IndexWriter(dir, iwc);
            System.out.println("Indexing to directory '" + indexDirectory + "'...");
            indexDocs(writer, new File(sourceDirectory));
            System.out.println("Optimizing...");
            writer.close();
            System.out.println("Finished Indexing");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Recursively indexes a directory of triple files. Lines are expected as
     * "subject predicate object"; each subject becomes one Lucene document.
     */
    private void indexDocs(IndexWriter writer, File file) {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    System.out.print(i + " ");
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            System.out.println("adding " + file);
            try {
                FileInputStream fstream = new FileInputStream(file);
                DataInputStream in = new DataInputStream(fstream);
                BufferedReader br = new BufferedReader(new InputStreamReader(in));
                String line;
                Set<String> labels = new HashSet<String>();
                HashSet<String> defaultLabel = new HashSet<String>();
                Set<String> lookUpLabels = new HashSet<String>();
                String subject = "";
                HashMap<String, Object> data = new HashMap<String, Object>();
                int count = 0;

                while ((line = br.readLine()) != null) {
                    String[] spo = line.split(" ", 3);
                    if (spo.length < 3) {
                        continue;
                    }
                    // Subject and predicate must be URIs or blank nodes.
                    if (!(spo[0].startsWith("<http") || spo[0].startsWith("_:"))) {
                        continue;
                    }
                    if (!(spo[1].startsWith("<http") || spo[1].startsWith("_:"))) {
                        continue;
                    }
                    count++;

                    // A new subject starts: flush the previous entity to the index.
                    if (!subject.equals(spo[0])) {
                        if (!subject.equals("")) {
                            data.put("url", subject);
                            data.put("label", labels);
                            data.put("defaultLabel", defaultLabel);
                            data.put("lookUpLabel", lookUpLabels);
                            System.out.println(count + " adding " + subject);
                            Document doc = getDoc(data);
                            writer.updateDocument(new Term("url", subject), doc);
                        }
                        subject = spo[0];
                        defaultLabel = new HashSet<String>();
                        labels = new HashSet<String>();
                        lookUpLabels = new HashSet<String>();
                    }

                    if (spo[2].equals("")) {
                        continue;
                    }
                    // Only name/label properties contribute surface forms.
                    String property = spo[1].toLowerCase();
                    if (!property.contains("name") && !property.contains("label")) {
                        continue;
                    }

                    // Append the property weight as a payload suffix: "value|weight".
                    spo[2] = spo[2].toLowerCase();
                    spo[2] = spo[2] + "|" + getPropertyWeight(spo[1]);
                    if (spo[1].contains("urlName") || spo[1].contains("redirectName")) {
                        defaultLabel.add(spo[2]);
                    }
                    labels.add(spo[2]);
                    String removeSingles = Utils.removeSingleLetter(spo[2]);
                    if (!removeSingles.equals(spo[0])) {
                        labels.add(removeSingles);
                    }
                    addingLabels(lookUpLabels, spo[2]);
                }

                // Index the last entity in the file.
                if (!subject.equals("")) {
                    data.put("url", subject);
                    data.put("label", labels);
                    data.put("defaultLabel", defaultLabel);
                    data.put("lookUpLabel", lookUpLabels);
                    System.out.println(count + " adding " + subject);
                    Document doc = getDoc(data);
                    writer.updateDocument(new Term("url", subject), doc);
                }
                br.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    private float getPropertyWeight(String url) {
        if (propertyWeight.containsKey(url)) {
            return propertyWeight.get(url);
        }
        return 1;
    }

    /**
     * Loads "propertyURI weight" pairs (one per line) used to weight label values.
     */
    public void setPropertyWeight(String weightFile) {
        try {
            System.out.println("weight file: " + weightFile + " end");
            FileInputStream fstream = new FileInputStream(weightFile);
            DataInputStream in = new DataInputStream(fstream);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] weight = line.split(" ", 2);
                if (weight.length < 2) {
                    continue;
                }
                propertyWeight.put(weight[0], Float.parseFloat(weight[1]));
            }
            in.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the entity weight file; it is read lazily by getEntityWeight().
     */
    public void setEntityWeightFile(String weightFile) {
        try {
            System.out.println("weight file: " + weightFile + " end");
            FileInputStream fstream = new FileInputStream(weightFile);
            DataInputStream in = new DataInputStream(fstream);
            weightReader = new BufferedReader(new InputStreamReader(in));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Looks up the boost for a URL by scanning forward through the entity
     * weight file. Both the file and the incoming URLs are assumed to be
     * sorted; a default weight of 1 is returned when no entry is found.
     */
    private float getEntityWeight(String url) {
        int counts = 1;
        try {
            // First check the line left over from the previous lookup.
            if (currentLine != null) {
                String[] urlWeight = currentLine.split(" ", 2);
                if (urlWeight.length >= 2) {
                    if (urlWeight[0].equals(url)) {
                        float myWeight = Float.parseFloat(urlWeight[1]);
                        System.out.println("found: " + url + " weight: " + myWeight);
                        return myWeight;
                    }
                    int result = urlWeight[0].compareTo(url);
                    if (result > 0) {
                        return 1;
                    }
                }
            }
            while ((currentLine = weightReader.readLine()) != null) {
                String[] urlWeight = currentLine.split(" ", 2);
                if (urlWeight.length < 2) {
                    continue;
                }
                int result = urlWeight[0].compareTo(url);
                if (result == 0) {
                    float myWeight = Float.parseFloat(urlWeight[1]);
                    System.out.println("found: " + url + " weight: " + myWeight);
                    return myWeight;
                } else if (result > 0) {
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        float myWeight = (float) (1 + Math.log(counts));
        System.out.println("not found: " + url + " weight: " + myWeight);
        return myWeight;
    }

    /**
     * Adds every word-by-word prefix of the label as a look-up key, plus
     * variants with single-letter tokens and the "|weight" suffix removed.
     */
    private void addingLabels(Set<String> hashSet, String label) {
        StringBuilder labelBuilder = new StringBuilder();
        label = label.toLowerCase();
        for (String labelPart : label.split(" ")) {
            labelBuilder.append(labelPart);
            hashSet.add(labelBuilder.toString());
            labelBuilder.append(" ");
        }
        Set<String> noSingleLetter = new HashSet<String>();
        for (String str : hashSet) {
            noSingleLetter.add(Utils.removeSingleLetter(str.replaceAll("\\|\\d+.*", "")));
        }
        hashSet.addAll(noSingleLetter);
    }

    /**
     * Builds the Lucene document for one entity.
     */
    private Document getDoc(HashMap<String, Object> data) {
        Document doc = new Document();
        doc.add(new StringField("url", (String) data.get("url"), Field.Store.YES));

        HashSet<String> labels = (HashSet<String>) data.get("label");
        HashSet<String> lookUpLabels = (HashSet<String>) data.get("lookUpLabel");
        HashSet<String> defaultLabels = (HashSet<String>) data.get("defaultLabel");

        for (String defaultLabel : defaultLabels) {
            doc.add(new TextField("defaultLabel", defaultLabel, Field.Store.YES));
        }
        for (String label : labels) {
            // Analyzed with the EntropyAnalyzer (payload-aware).
            doc.add(new TextField("label", label, Field.Store.YES));
            // Analyzed with the standard analyzer, weight suffix stripped.
            String[] analyzedLabel = label.split("\\|");
            doc.add(new TextField("analyzedLabel", analyzedLabel[0], Field.Store.NO));
        }
        for (String lookUpLabel : lookUpLabels) {
            // Indexed as an exact-match key, not analyzed.
            doc.add(new StringField("lookUpLabel", lookUpLabel, Field.Store.NO));
        }

        float weight = getEntityWeight((String) data.get("url"));
        doc.add(new FloatField("boost", weight, Field.Store.YES));
        return doc;
    }
}
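For context, here is a minimal, hypothetical driver showing how the updater might be wired together. The class name, paths, and weight-file names below are placeholders, and the weight-file format (a URI followed by a float, one pair per line, with the entity file sorted by URI) is inferred from setPropertyWeight() and getEntityWeight() above rather than taken from Linkipedia documentation.

// Hypothetical usage sketch; paths and file names are placeholders.
public class SurfaceFormIndexUpdaterExample {

    public static void main(String[] args) {
        // Triple files live under data/triples; the Lucene index is written
        // to data/surfaceFormIndex.
        SurfaceFormIndexUpdater updater =
                new SurfaceFormIndexUpdater("data/triples", "data/surfaceFormIndex");

        // Optional: "<propertyURI> <float>" per line; boosts labels taken
        // from that property (defaults to 1 otherwise).
        updater.setPropertyWeight("data/propertyWeights.txt");

        // Optional: "<entityURI> <float>" per line, sorted by URI so that
        // getEntityWeight() can scan it in one forward pass.
        updater.setEntityWeightFile("data/entityWeights.txt");

        updater.updateIndex();
    }
}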