Java tutorial
/******************************************************************************* * Copyright 2013 Robert Ying, based on code by Ernesto Tapias * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package ocr; import math.*; import java.util.*; import weka.core.*; import weka.filters.unsupervised.attribute.*; import weka.filters.*; import java.io.*; public class ARFFSymbolFilter { static boolean accumulatedAngle; static double alpha; static boolean areaPoly2RelativeToBoundingBox; static boolean aspect; static boolean centerOfGravity; static boolean cluster; static boolean costurning; static boolean costurningchange; static boolean curliness; static boolean dehook; static boolean equidistant; static boolean filter; static boolean firstLastPointsDistanceRelativeToLength; static boolean getCurvature; static boolean getCurvature2; static boolean getX; static boolean getY; static boolean globalCos; static boolean globalSin; static boolean labelOut; static boolean length; static boolean lengthPosition; static boolean linearity; static int na; static int nc; static boolean normalizedDirection; static int np; static int npeq; static int ns; static boolean orderStrokes; static boolean proyectionPolygonal; static boolean random; static boolean relativeFirstLastPointDistance; static boolean relativeHeight; static boolean relativeLengthFirstLastPointDistance; static boolean relativeWidth; static Random rnd; static boolean scale; static double sigma; static boolean sinturning; static boolean sinturningchange; static boolean smooth; static boolean strokenumber; static boolean sumDistanceError; static boolean tangent; static int times; static { ARFFSymbolFilter.alpha = 0.15; ARFFSymbolFilter.na = -1; ARFFSymbolFilter.np = 24; ARFFSymbolFilter.npeq = 32; ARFFSymbolFilter.nc = 80; ARFFSymbolFilter.ns = -1; ARFFSymbolFilter.times = 1; ARFFSymbolFilter.labelOut = false; ARFFSymbolFilter.random = false; ARFFSymbolFilter.sigma = 0.15; ARFFSymbolFilter.rnd = new Random(); ARFFSymbolFilter.centerOfGravity = false; ARFFSymbolFilter.accumulatedAngle = true; ARFFSymbolFilter.areaPoly2RelativeToBoundingBox = true; ARFFSymbolFilter.aspect = true; ARFFSymbolFilter.curliness = true; ARFFSymbolFilter.firstLastPointsDistanceRelativeToLength = true; ARFFSymbolFilter.getCurvature = false; ARFFSymbolFilter.getCurvature2 = true; ARFFSymbolFilter.relativeWidth = false; ARFFSymbolFilter.relativeHeight = false; ARFFSymbolFilter.globalCos = false; ARFFSymbolFilter.globalSin = false; ARFFSymbolFilter.length = true; ARFFSymbolFilter.linearity = true; ARFFSymbolFilter.relativeFirstLastPointDistance = false; ARFFSymbolFilter.relativeLengthFirstLastPointDistance = false; ARFFSymbolFilter.sumDistanceError = false; ARFFSymbolFilter.smooth = true; ARFFSymbolFilter.proyectionPolygonal = false; ARFFSymbolFilter.dehook = true; ARFFSymbolFilter.equidistant = true; ARFFSymbolFilter.cluster = true; ARFFSymbolFilter.tangent = true; ARFFSymbolFilter.filter = false; ARFFSymbolFilter.normalizedDirection = true; ARFFSymbolFilter.orderStrokes = true; ARFFSymbolFilter.scale = true; ARFFSymbolFilter.getX = true; ARFFSymbolFilter.getY = true; ARFFSymbolFilter.lengthPosition = false; ARFFSymbolFilter.costurning = true; ARFFSymbolFilter.sinturning = true; ARFFSymbolFilter.costurningchange = true; ARFFSymbolFilter.sinturningchange = true; ARFFSymbolFilter.strokenumber = true; } public static FastVector constructAttributes(final int n) { final FastVector fv = new FastVector(); boolean oldds = false; for (int j = 0; j < n; ++j) { if (ARFFSymbolFilter.centerOfGravity) { fv.addElement(new Attribute("centerOfGravityX")); } if (ARFFSymbolFilter.centerOfGravity) { fv.addElement(new Attribute("centerOfGravityY")); } if (ARFFSymbolFilter.accumulatedAngle) { fv.addElement(new Attribute("accumulatedAngle")); } if (ARFFSymbolFilter.areaPoly2RelativeToBoundingBox) { fv.addElement(new Attribute("areaPoly2RelativeToBoundingBox")); } if (ARFFSymbolFilter.aspect) { fv.addElement(new Attribute("aspect")); } if (ARFFSymbolFilter.curliness) { fv.addElement(new Attribute("curliness")); } if (ARFFSymbolFilter.firstLastPointsDistanceRelativeToLength) { fv.addElement(new Attribute("firstLastPointsDistanceRelativeToLength")); } if (ARFFSymbolFilter.getCurvature) { fv.addElement(new Attribute("getCurvature")); } if (ARFFSymbolFilter.getCurvature2) { fv.addElement(new Attribute("getCurvature2")); } if (ARFFSymbolFilter.relativeWidth) { fv.addElement(new Attribute("relativeWidth")); } if (ARFFSymbolFilter.relativeHeight) { fv.addElement(new Attribute("relativeHeight")); } if (ARFFSymbolFilter.globalCos) { fv.addElement(new Attribute("globalCos")); } if (ARFFSymbolFilter.globalSin) { fv.addElement(new Attribute("globalSin")); } if (ARFFSymbolFilter.length) { fv.addElement(new Attribute("length")); } if (ARFFSymbolFilter.linearity) { fv.addElement(new Attribute("linearity")); } if (ARFFSymbolFilter.relativeFirstLastPointDistance) { fv.addElement(new Attribute("relativeFirstLastPointDistance")); } if (ARFFSymbolFilter.relativeLengthFirstLastPointDistance) { fv.addElement(new Attribute("relativeLengthFirstLastPointDistance")); } if (ARFFSymbolFilter.sumDistanceError) { fv.addElement(new Attribute("sumDistanceError")); } if (oldds) { fv.addElement(new Attribute("ds.first().distance(oldds.first())")); fv.addElement(new Attribute("ds.first().distance(oldds.last())")); fv.addElement(new Attribute("ds.last().distance(oldds.first())")); fv.addElement(new Attribute("ds.last().distance(oldds.last())")); } for (int i = 0; i < ARFFSymbolFilter.np; ++i) { if (ARFFSymbolFilter.getX) { fv.addElement(new Attribute("strokeAt(" + j + ").getX(" + i + ")")); } if (ARFFSymbolFilter.getY) { fv.addElement(new Attribute("strokeAt(" + j + ").getY(" + i + ")")); } if (ARFFSymbolFilter.lengthPosition && i != 0 && i != ARFFSymbolFilter.np - 1) { fv.addElement(new Attribute("strokeAt(" + j + ").lengthPosition(" + i + ")")); } if (i < ARFFSymbolFilter.np - 1) { if (ARFFSymbolFilter.costurning) { fv.addElement(new Attribute("strokeAt(" + j + ").cos(a[" + i + "])")); } if (ARFFSymbolFilter.sinturning) { fv.addElement(new Attribute("strokeAt(" + j + ").sin(a[" + i + "])")); } } if (i < ARFFSymbolFilter.np - 2) { if (ARFFSymbolFilter.costurningchange) { fv.addElement(new Attribute("strokeAt(" + j + ").cos(a[" + (i + 1) + "]-a[" + i + "])")); } if (ARFFSymbolFilter.sinturningchange) { fv.addElement(new Attribute("strokeAt(" + j + ").sin(a[" + (i + 1) + "]-a[" + i + "])")); } } } oldds = true; } fv.addElement(new Attribute("class", (FastVector) null)); return fv; } public static Instance constructInstance(final Symbol s, final double alph) { final StringTokenizer st = new StringTokenizer(constructStringInstance(s, alph), " "); ARFFSymbolFilter.na = st.countTokens() / 2; if (ARFFSymbolFilter.na == 0) { return null; } final Instance instance = new Instance(ARFFSymbolFilter.na + 1); for (int i = 0; i < ARFFSymbolFilter.na; ++i) { st.nextToken(); instance.setValue(i, Double.parseDouble(st.nextToken())); } return instance; } public static String constructStringInstance(final Symbol s, final double alph) { String str = null; Symbol newSymbol = new Symbol(); newSymbol = new Symbol(); for (int i = 0; i < s.size(); ++i) { DStroke ds = new DStroke(s.strokeAt(i)); if (ds.getWidth() < 5.0 && ds.getHeight() < 5.0 && ds.size() < 3) { return null; } try { if (ARFFSymbolFilter.smooth) { ds = ds.smooth(); } if (ARFFSymbolFilter.proyectionPolygonal) { ds = ds.proyectionPolygonal(ARFFSymbolFilter.np); } if (ARFFSymbolFilter.cluster) { ds = ds.clusterPoints(1.0 / ARFFSymbolFilter.nc); } if (ARFFSymbolFilter.equidistant) { ds = ds.equidistantLength(ARFFSymbolFilter.np); } if (ARFFSymbolFilter.dehook) { ds = ds.getDehooked(); if (ds.size() < ARFFSymbolFilter.np) { if (ARFFSymbolFilter.equidistant) { ds = ds.equidistantLength(ARFFSymbolFilter.np); } else { ds = ds.proyectionPolygonal(ARFFSymbolFilter.np); } } } } catch (ArrayIndexOutOfBoundsException ex) { return null; } newSymbol.add(ds); } if (alph != 0.0 && ARFFSymbolFilter.tangent) { newSymbol.tangentTansformation(alph); } if (ARFFSymbolFilter.filter) { newSymbol.filter(); } if (ARFFSymbolFilter.normalizedDirection) { newSymbol.normalizeDirection(); } if (ARFFSymbolFilter.orderStrokes) { newSymbol.orderStrokes(); } if (ARFFSymbolFilter.scale) { newSymbol.scale(); } str = ""; DStroke oldds = null; for (int j = 0; j < newSymbol.size(); ++j) { final DStroke ds = newSymbol.strokeAt(j); if (ARFFSymbolFilter.centerOfGravity) { str = str + "centerOfGravityX " + (float) ds.centerOfGravity().x + " "; } if (ARFFSymbolFilter.centerOfGravity) { str = str + "centerOfGravityY " + (float) ds.centerOfGravity().y + " "; } if (ARFFSymbolFilter.accumulatedAngle) { str = str + "accumulatedAngle " + (float) ds.accumulatedAngle() + " "; } if (ARFFSymbolFilter.areaPoly2RelativeToBoundingBox) { str = str + "areaPoly2RelativeToBoundingBox " + (float) ds.areaPoly2RelativeToBoundingBox() + " "; } if (ARFFSymbolFilter.aspect) { str = str + "aspect " + (float) ds.aspect() + " "; } if (ARFFSymbolFilter.curliness) { str = str + "curliness " + (float) ds.curliness() + " "; } if (ARFFSymbolFilter.firstLastPointsDistanceRelativeToLength) { str = str + "firstLastPointsDistanceRelativeToLength " + (float) ds.firstLastPointsDistanceRelativeToLength() + " "; } if (ARFFSymbolFilter.getCurvature) { str = str + "getCurvature " + (float) ds.getCurvature() + " "; } if (ARFFSymbolFilter.getCurvature2) { str = str + "getCurvature2 " + (float) ds.getCurvature2() + " "; } if (ARFFSymbolFilter.relativeWidth) { str = str + "relativeWidth " + (float) ds.getWidth() / Math.max(ds.getWidth(), ds.getHeight()) + " "; } if (ARFFSymbolFilter.relativeHeight) { str = str + "relativeHeight " + (float) ds.getHeight() / Math.max(ds.getWidth(), ds.getHeight()) + " "; } if (ARFFSymbolFilter.globalCos) { str = str + "globalCos " + (float) ds.globalCos() + " "; } if (ARFFSymbolFilter.globalSin) { str = str + "globalSin " + (float) ds.globalSin() + " "; } if (ARFFSymbolFilter.length) { str = str + "length " + (float) ds.length() + " "; } if (ARFFSymbolFilter.linearity) { str = str + "linearity " + (float) ds.linearity() + " "; } if (ARFFSymbolFilter.relativeFirstLastPointDistance) { str = str + "relativeFirstLastPointDistance " + (float) ds.relativeFirstLastPointDistance() + " "; } if (ARFFSymbolFilter.relativeLengthFirstLastPointDistance) { str = str + "relativeLengthFirstLastPointDistance " + (float) ds.relativeLengthFirstLastPointDistance() + " "; } if (ARFFSymbolFilter.sumDistanceError) { str = str + "sumDistanceError " + (float) ds.sumDistanceError() + " "; } if (oldds != null) { str = str + "ds.first().distance(oldds.first()) " + (float) ds.first().distance(oldds.first()) + " "; str = str + "ds.first().distance(oldds.last()) " + (float) ds.first().distance(oldds.last()) + " "; str = str + "ds.last().distance(oldds.first()) " + (float) ds.last().distance(oldds.first()) + " "; str = str + "ds.last().distance(oldds.last()) " + (float) ds.last().distance(oldds.last()) + " "; } double d; double lp = d = ((0.0)); final double[] a = ds.turningFunction(); for (int i = 0; i < ds.size(); ++i) { if (ARFFSymbolFilter.getX) { str = str + "strokeAt(" + j + ").getX(" + i + ") " + (float) ds.getX(i) + " "; } if (ARFFSymbolFilter.getY) { str = str + "strokeAt(" + j + ").getY(" + i + ") " + (float) ds.getY(i) + " "; } lp += d; if (ARFFSymbolFilter.lengthPosition && i != 0 && i != ds.size() - 1) { str = str + "strokeAt(" + j + ").lengthPosition(" + i + ") " + (float) lp + " "; } if (ARFFSymbolFilter.linearity) { Geom.height(ds.first(), ds.last(), ds.pointAt(i)); } if (i < ds.size() - 1) { d = ds.length(i, i + 1); final double cos1 = Math.cos(a[i]); final double sin1 = Math.sin(a[i]); if (ARFFSymbolFilter.costurning) { str = str + "strokeAt(" + j + ").cos(a[" + i + "]) " + (float) cos1 + " "; } if (ARFFSymbolFilter.sinturning) { str = str + "strokeAt(" + j + ").sin(a[" + i + "]) " + (float) sin1 + " "; } } if (i < ds.size() - 2) { final double cos2 = Math.cos(a[i + 1] - a[i]); final double sin2 = Math.sin(a[i + 1] - a[i]); final double diffcos = cos2; final double diffsin = sin2; if (ARFFSymbolFilter.costurningchange) { str = str + "strokeAt(" + j + ").cos(a[" + (i + 1) + "]-a[" + i + "]) " + (float) diffcos + " "; } if (ARFFSymbolFilter.sinturningchange) { str = str + "strokeAt(" + j + ").sin(a[" + (i + 1) + "]-a[" + i + "]) " + (float) diffsin + " "; } } } oldds = ds; } return str; } public static String constructStringSparseVector(final Symbol s, final double alph) { String str = ""; final StringTokenizer st = new StringTokenizer(constructStringInstance(s, alph), " "); final int na = st.countTokens() / 2; if (na == 0) { return null; } for (int i = 0; i < na; ++i) { st.nextToken(); str = str + " " + i + ":" + st.nextToken(); } str = s.name + str; return str; } public static String getProcessedSingle(final Symbol s) { return getProcessedSingle(s, 0.0); } public static String getProcessedSingle(final Symbol s, final double alph) { Symbol newSymbol = new Symbol(); newSymbol = new Symbol(); for (int i = 0; i < s.size(); ++i) { DStroke ds = new DStroke(s.strokeAt(i)); if (ds.getWidth() < 5.0 && ds.getHeight() < 5.0 && ds.size() < 3) { return ""; } try { if (ARFFSymbolFilter.smooth) { ds = ds.smooth(); } if (ARFFSymbolFilter.proyectionPolygonal) { ds = ds.proyectionPolygonal(ARFFSymbolFilter.np); } if (ARFFSymbolFilter.cluster) { ds = ds.clusterPoints(1.0 / ARFFSymbolFilter.nc); } if (ARFFSymbolFilter.dehook) { ds = ds.getDehooked(); } if (ARFFSymbolFilter.equidistant) { ds = ds.equidistantLength(ARFFSymbolFilter.np); } } catch (ArrayIndexOutOfBoundsException ex) { return ""; } newSymbol.add(ds); } if (alph != 0.0 && ARFFSymbolFilter.tangent) { newSymbol.tangentTansformation(alph); } if (ARFFSymbolFilter.filter) { newSymbol.filter(); } if (ARFFSymbolFilter.normalizedDirection) { newSymbol.normalizeDirection(); } if (ARFFSymbolFilter.orderStrokes) { newSymbol.orderStrokes(); } if (ARFFSymbolFilter.scale) { newSymbol.scale(); } String str = ""; int count = 0; DStroke oldds = null; for (int j = 0; j < newSymbol.size(); ++j) { final DStroke ds = newSymbol.strokeAt(j); if (ARFFSymbolFilter.centerOfGravity) { str = str + "" + count++ + ":" + (float) ds.centerOfGravity().x + " "; } if (ARFFSymbolFilter.centerOfGravity) { str = str + "" + count++ + ":" + (float) ds.centerOfGravity().y + " "; } if (ARFFSymbolFilter.accumulatedAngle) { str = str + "" + count++ + ":" + (float) ds.accumulatedAngle() + " "; } if (ARFFSymbolFilter.areaPoly2RelativeToBoundingBox) { str = str + "" + count++ + ":" + (float) ds.areaPoly2RelativeToBoundingBox() + " "; } if (ARFFSymbolFilter.aspect) { str = str + "" + count++ + ":" + (float) ds.aspect() + " "; } if (ARFFSymbolFilter.curliness) { str = str + "" + count++ + ":" + (float) ds.curliness() + " "; } if (ARFFSymbolFilter.firstLastPointsDistanceRelativeToLength) { str = str + "" + count++ + ":" + (float) ds.firstLastPointsDistanceRelativeToLength() + " "; } if (ARFFSymbolFilter.getCurvature) { str = str + "" + count++ + ":" + (float) ds.getCurvature() + " "; } if (ARFFSymbolFilter.getCurvature2) { str = str + "" + count++ + ":" + (float) ds.getCurvature2() + " "; } if (ARFFSymbolFilter.relativeWidth) { str = str + "" + count++ + ":" + (float) ds.getWidth() / Math.max(ds.getWidth(), ds.getHeight()) + " "; } if (ARFFSymbolFilter.relativeHeight) { str = str + "" + count++ + ":" + (float) ds.getHeight() / Math.max(ds.getWidth(), ds.getHeight()) + " "; } if (ARFFSymbolFilter.globalCos) { str = str + "" + count++ + ":" + (float) ds.globalCos() + " "; } if (ARFFSymbolFilter.globalSin) { str = str + "" + count++ + ":" + (float) ds.globalSin() + " "; } if (ARFFSymbolFilter.length) { str = str + "" + count++ + ":" + (float) ds.length() + " "; } if (ARFFSymbolFilter.linearity) { str = str + "" + count++ + ":" + (float) ds.linearity() + " "; } if (ARFFSymbolFilter.relativeFirstLastPointDistance) { str = str + "" + count++ + ":" + (float) ds.relativeFirstLastPointDistance() + " "; } if (ARFFSymbolFilter.relativeLengthFirstLastPointDistance) { str = str + "" + count++ + ":" + (float) ds.relativeLengthFirstLastPointDistance() + " "; } if (ARFFSymbolFilter.sumDistanceError) { str = str + "" + count++ + ":" + (float) ds.sumDistanceError() + " "; } if (oldds != null) { str = str + "" + count++ + ":" + (float) ds.first().distance(oldds.first()) + " "; str = str + "" + count++ + ":" + (float) ds.first().distance(oldds.last()) + " "; str = str + "" + count++ + ":" + (float) ds.last().distance(oldds.first()) + " "; str = str + "" + count++ + ":" + (float) ds.last().distance(oldds.last()) + " "; } double d; double lp = d = ((0.0)); final double[] a = ds.turningFunction(); for (int i = 0; i < ds.size(); ++i) { if (ARFFSymbolFilter.getX) { str = str + "" + count++ + ":" + (float) ds.getX(i) + " "; } if (ARFFSymbolFilter.getY) { str = str + "" + count++ + ":" + (float) ds.getY(i) + " "; } lp += d; if (ARFFSymbolFilter.lengthPosition && i != 0 && i != ds.size() - 1) { str = str + "" + count++ + ":" + (float) lp + " "; } if (ARFFSymbolFilter.linearity) { Geom.height(ds.first(), ds.last(), ds.pointAt(i)); } if (i < ds.size() - 1) { d = ds.length(i, i + 1); final double cos1 = Math.cos(a[i]); final double sin1 = Math.sin(a[i]); if (ARFFSymbolFilter.costurning) { str = str + "" + count++ + ":" + (float) cos1 + " "; } if (ARFFSymbolFilter.sinturning) { str = str + "" + count++ + ":" + (float) sin1 + " "; } } if (i < ds.size() - 2) { final double cos2 = Math.cos(a[i + 1] - a[i]); final double sin2 = Math.sin(a[i + 1] - a[i]); final double diffcos = cos2; final double diffsin = sin2; if (ARFFSymbolFilter.costurningchange) { str = str + "" + count++ + ":" + (float) diffcos + " "; } if (ARFFSymbolFilter.sinturningchange) { str = str + "" + count++ + ":" + (float) diffsin + " "; } } } oldds = ds; } return str; } public static String oldReplace(final String aInput, final String aOldPattern, final String aNewPattern) { final StringBuffer result = new StringBuffer(); int startIdx = 0; for (int idxOld = 0; (idxOld = aInput.indexOf(aOldPattern, startIdx)) >= 0; startIdx = idxOld + aOldPattern.length()) { result.append(aInput.substring(startIdx, idxOld)); result.append(aNewPattern); } result.append(aInput.substring(startIdx)); return result.toString(); } public static ArrayList<ArrayList<Symbol>> read(BufferedReader filein) { StringTokenizer st; String line; ArrayList<ArrayList<Symbol>> symbolData = new ArrayList<ArrayList<Symbol>>(); Symbol sym; ArrayList<Symbol> group; int nsim, i, count; try { while ((line = filein.readLine()) != null) { st = new StringTokenizer(line, " \n\t\r\f"); count = st.countTokens(); line = ""; for (i = 0; i < count - 1; i++) { line += st.nextToken() + " "; } nsim = Integer.parseInt(st.nextToken()); group = new ArrayList<Symbol>(); for (i = 0; i < nsim; i++) { sym = Symbol.readSymbol(filein); System.out.print("."); if (sym.size() == 0 || sym.name.equals("no_name")) { continue; } ns = Math.max(ns, sym.size()); group.add(new Symbol(sym)); } symbolData.add(group); } System.out.println("."); } catch (IOException ioe) { ioe.printStackTrace(); } return symbolData; } public static ArrayList<ArrayList<Symbol>> read(final String filenamein) { BufferedReader filein = null; ArrayList<ArrayList<Symbol>> v = new ArrayList<ArrayList<Symbol>>(); try { filein = new BufferedReader(new FileReader(filenamein)); System.out.println("Reading file " + filenamein); v = read(filein); } catch (IOException ioe) { System.out.println("Error reading " + filenamein + "."); ioe.printStackTrace(); } return v; } public static ArrayList<?> readOld(final String filenamein) { BufferedReader filein = null; ArrayList<?> v = new ArrayList<Object>(); try { filein = new BufferedReader(new FileReader(filenamein)); System.out.println("Reading file " + filenamein); v = readOld(filein); } catch (IOException ioe) { System.out.println("Error reading " + filenamein + "."); ioe.printStackTrace(); } return v; } public static ArrayList<ArrayList<Symbol>> readOld(final BufferedReader filein) throws IOException { final ArrayList<String> symbolNames = new ArrayList<String>(); final ArrayList<ArrayList<Symbol>> symbolData = new ArrayList<ArrayList<Symbol>>(); final String line; if ((line = filein.readLine()) == null) { System.err.println("ERROR: Empty file..."); return symbolData; } StringTokenizer st = new StringTokenizer(line, " \n\t\r\f"); int size = st.countTokens(); if (size == 0) { System.err.println("ERROR: Empty line..."); return symbolData; } st.nextToken(); final int numGroups = Integer.parseInt(st.nextToken()); st = new StringTokenizer(filein.readLine(), " \n\t\r\f"); size = st.countTokens(); for (int i = 0; i < size; ++i) { symbolNames.add(st.nextToken()); } for (int j = 0; j < numGroups; ++j) { final ArrayList<Symbol> actualGroup = new ArrayList<Symbol>(); st = new StringTokenizer(filein.readLine(), " \n\t\r\f"); st.nextToken(); for (int k = 0; k < size; ++k) { final Symbol actualSymbol = new Symbol(); actualSymbol.read(filein); actualSymbol.name = new String(symbolNames.get(k)); System.out.print("."); if (actualSymbol.size() != 0) { if (!actualSymbol.name.equals("no_name")) { actualGroup.add(actualSymbol); ARFFSymbolFilter.ns = Math.max(ARFFSymbolFilter.ns, actualSymbol.size()); } } } symbolData.add(actualGroup); } System.out.println("."); return symbolData; } public static void write(final String filenameout, final ArrayList<?> symbolData) { final int nsold = ARFFSymbolFilter.ns; ARFFSymbolFilter.tangent = (ARFFSymbolFilter.times > 1); try { if (!ARFFSymbolFilter.strokenumber) { ARFFSymbolFilter.ns = 1; } final DataOutputStream[] fileout = new DataOutputStream[ARFFSymbolFilter.ns]; System.out.println("Writing file"); for (int i = 0; i < ARFFSymbolFilter.ns; ++i) { final int k = ARFFSymbolFilter.strokenumber ? i : (nsold - 1); fileout[ARFFSymbolFilter.strokenumber ? i : 0] = new DataOutputStream(new FileOutputStream( filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".sv")); constructAttributes(k + 1); } final int tot = symbolData.size(); for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<?> group = (ArrayList<?>) symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = (Symbol) group.get(i); final int k = ARFFSymbolFilter.strokenumber ? (sym.size() - 1) : 0; if (sym.name.equals("no_name") || sym.name.equals("empty_symbol")) { System.out.print("#" + sym.name + "#"); } else { for (int t = 0; t < ARFFSymbolFilter.times; ++t) { final String line = constructStringSparseVector(sym, ARFFSymbolFilter.alpha); if (line == null) { System.out.print("inst=null"); } else { try { oldReplace(line, "\\", ""); } catch (ArrayIndexOutOfBoundsException aioobe) { System.out.print("!"); } fileout[k].writeBytes(line + "\n"); } } } } if ((int) (100.0 * j) / tot % 10 == 0) { System.out.print((int) (100.0 * j) / tot + "%-"); } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { fileout[k].close(); } System.out.println("100.0%"); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (Exception ioe) { ioe.printStackTrace(); } } public static void writeData(final String filenameout, final ArrayList<ArrayList<Symbol>> symbolData) { ARFFSymbolFilter.tangent = (ARFFSymbolFilter.times > 1); try { final DataOutputStream[] fileout = new DataOutputStream[ARFFSymbolFilter.ns]; System.out.println("Writing file"); @SuppressWarnings("unchecked") final ArrayList<String>[] names = new ArrayList[ARFFSymbolFilter.ns]; final int[] ndata = new int[ARFFSymbolFilter.ns]; for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { names[k] = new ArrayList<String>(); } for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<Symbol> group = symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = group.get(i); final int k = sym.size() - 1; try { int l; for (l = 0; l < names[k].size() && !names[k].get(l).equals(sym.name); ++l) { } if (l == names[k].size()) { names[k].add(sym.name); } final int[] array = ndata; final int n = k; ++array[n]; } catch (ArrayIndexOutOfBoundsException ex) { ex.printStackTrace(); } } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { fileout[k] = new DataOutputStream(new FileOutputStream(filenameout + k + ".dat")); } int nex = 0; for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<?> group = symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = (Symbol) group.get(i); final int k = sym.size() - 1; if (sym.name.equals("no_name") || sym.name.equals("empty_symbol")) { System.out.print("#" + sym.name + "#"); } else { for (int t = 0; t < ARFFSymbolFilter.times; ++t) { System.out.print("."); String line; try { line = getProcessedSingle(sym, ARFFSymbolFilter.alpha); } catch (Exception ex2) { System.out.print("*"); ++nex; continue; } if (ARFFSymbolFilter.labelOut) { int l; for (l = 0; l < names[k].size() && !names[k].get(l).equals(sym.name); ++l) { } fileout[k].writeBytes(l + " "); } else { fileout[k].writeBytes(sym.name + " "); } fileout[k].writeBytes(line + "\n"); } } } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { fileout[k].close(); } if (nex != 0) { System.out.println("getProcessedSingle(Symbol,double) throwed " + nex + " Exceptions while processing your data."); } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { fileout[k] = new DataOutputStream(new FileOutputStream(filenameout + k + ".names")); for (int l = 0; l < names[k].size(); ++l) { fileout[k].writeBytes(names[k].get(l) + " "); } fileout[k].writeBytes("\n"); fileout[k].close(); } System.out.println(); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } } public static void writeGini(final String filenameout, final ArrayList<ArrayList<Symbol>> symbolData) { double noise = 0.0; ARFFSymbolFilter.tangent = (ARFFSymbolFilter.times > 1); try { final DataOutputStream[] fileout = new DataOutputStream[ARFFSymbolFilter.ns]; @SuppressWarnings("unchecked") final ArrayList<String>[] names = new ArrayList[ARFFSymbolFilter.ns]; final int[] count = new int[ARFFSymbolFilter.ns]; final int[] ndata = new int[ARFFSymbolFilter.ns]; System.out.println("Writing file"); for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { names[k] = new ArrayList<String>(); } for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<Symbol> group = symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = group.get(i); final int k = sym.size() - 1; try { int l; for (l = 0; l < names[k].size() && !names[k].get(l).equals(sym.name); ++l) { } if (l == names[k].size()) { names[k].add(sym.name); } final int[] array = ndata; final int n = k; ++array[n]; } catch (ArrayIndexOutOfBoundsException ex) { ex.printStackTrace(); } } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { boolean oldds = false; fileout[k] = new DataOutputStream(new FileOutputStream(filenameout + (k + 1) + ".gini")); for (int j = 0; j < k + 1; ++j) { if (ARFFSymbolFilter.centerOfGravity) { final int[] array2 = count; final int n2 = k; ++array2[n2]; } if (ARFFSymbolFilter.centerOfGravity) { final int[] array3 = count; final int n3 = k; ++array3[n3]; } if (ARFFSymbolFilter.accumulatedAngle) { final int[] array4 = count; final int n4 = k; ++array4[n4]; } if (ARFFSymbolFilter.areaPoly2RelativeToBoundingBox) { final int[] array5 = count; final int n5 = k; ++array5[n5]; } if (ARFFSymbolFilter.aspect) { final int[] array6 = count; final int n6 = k; ++array6[n6]; } if (ARFFSymbolFilter.curliness) { final int[] array7 = count; final int n7 = k; ++array7[n7]; } if (ARFFSymbolFilter.firstLastPointsDistanceRelativeToLength) { final int[] array8 = count; final int n8 = k; ++array8[n8]; } if (ARFFSymbolFilter.getCurvature) { final int[] array9 = count; final int n9 = k; ++array9[n9]; } if (ARFFSymbolFilter.getCurvature2) { final int[] array10 = count; final int n10 = k; ++array10[n10]; } if (ARFFSymbolFilter.relativeWidth) { final int[] array11 = count; final int n11 = k; ++array11[n11]; } if (ARFFSymbolFilter.relativeHeight) { final int[] array12 = count; final int n12 = k; ++array12[n12]; } if (ARFFSymbolFilter.globalCos) { final int[] array13 = count; final int n13 = k; ++array13[n13]; } if (ARFFSymbolFilter.globalSin) { final int[] array14 = count; final int n14 = k; ++array14[n14]; } if (ARFFSymbolFilter.length) { final int[] array15 = count; final int n15 = k; ++array15[n15]; } if (ARFFSymbolFilter.linearity) { final int[] array16 = count; final int n16 = k; ++array16[n16]; } if (ARFFSymbolFilter.relativeFirstLastPointDistance) { final int[] array17 = count; final int n17 = k; ++array17[n17]; } if (ARFFSymbolFilter.relativeLengthFirstLastPointDistance) { final int[] array18 = count; final int n18 = k; ++array18[n18]; } if (ARFFSymbolFilter.sumDistanceError) { final int[] array19 = count; final int n19 = k; ++array19[n19]; } if (oldds) { final int[] array20 = count; final int n20 = k; ++array20[n20]; final int[] array21 = count; final int n21 = k; ++array21[n21]; final int[] array22 = count; final int n22 = k; ++array22[n22]; final int[] array23 = count; final int n23 = k; ++array23[n23]; } for (int i = 0; i < ARFFSymbolFilter.np; ++i) { if (ARFFSymbolFilter.getX) { final int[] array24 = count; final int n24 = k; ++array24[n24]; } if (ARFFSymbolFilter.getY) { final int[] array25 = count; final int n25 = k; ++array25[n25]; } if (ARFFSymbolFilter.lengthPosition && i != 0 && i != ARFFSymbolFilter.np - 1) { final int[] array26 = count; final int n26 = k; ++array26[n26]; } if (i < ARFFSymbolFilter.np - 1) { if (ARFFSymbolFilter.costurning) { final int[] array27 = count; final int n27 = k; ++array27[n27]; } if (ARFFSymbolFilter.sinturning) { final int[] array28 = count; final int n28 = k; ++array28[n28]; } } if (i < ARFFSymbolFilter.np - 2) { if (ARFFSymbolFilter.costurningchange) { final int[] array29 = count; final int n29 = k; ++array29[n29]; } if (ARFFSymbolFilter.sinturningchange) { final int[] array30 = count; final int n30 = k; ++array30[n30]; } } } oldds = true; } fileout[k].writeBytes(count[k] + "\n"); fileout[k].writeBytes(names[k].size() + "\n"); fileout[k].writeBytes(ndata[k] * ARFFSymbolFilter.times + "\n"); } final int tot = symbolData.size(); for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<?> group = symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = (Symbol) group.get(i); final int k = sym.size() - 1; if (sym.name.equals("no_name") || sym.name.equals("empty_symbol")) { System.out.print("#" + sym.name + "#"); } else { for (int t = 0; t < ARFFSymbolFilter.times; ++t) { final String line = getProcessedSingle(sym, ARFFSymbolFilter.alpha); final StringTokenizer st = new StringTokenizer(line, " :\n\t"); final int ntk = st.countTokens() / 2; if (ntk == 0) { System.out.print("ntk == 0"); } else if (line.equals("")) { System.out.print("line.equals(\"\")"); } else if (ntk != count[k]) { System.out.print("ntk != count[k]"); } else { if (ARFFSymbolFilter.labelOut) { int l; for (l = 0; l < names[k].size() && !names[k].get(l).equals(sym.name); ++l) { } fileout[k].writeBytes(l + "\n"); } else { int lrand = -1; noise = 0.0; if (ARFFSymbolFilter.random) { lrand = (int) (names[k].size() * Math.random()); noise = ARFFSymbolFilter.rnd.nextDouble() * ARFFSymbolFilter.sigma; } for (int l = 0; l < names[k].size(); ++l) { if (names[k].get(l).equals(sym.name)) { fileout[k].writeBytes(1.0 - noise + "\n"); } else if (lrand == l) { fileout[k].writeBytes(noise + "\n"); } else { fileout[k].writeBytes("0.0\n"); } } } fileout[k].writeBytes("1\n"); for (int l = 0; l < ntk; ++l) { st.nextToken(); fileout[k].writeBytes(st.nextToken() + "\n"); } System.out.print("."); } } } } if ((int) (100.0 * j) / tot % 10 == 0) { System.out.print((int) (100.0 * j) / tot + "%"); } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { fileout[k].close(); } System.out.println(); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } } public static void writeWeka(final String filenameout, final ArrayList<?> symbolData) { final int nsold = ARFFSymbolFilter.ns; ARFFSymbolFilter.tangent = (ARFFSymbolFilter.times > 1); try { if (!ARFFSymbolFilter.strokenumber) { ARFFSymbolFilter.ns = 1; } final DataOutputStream[] fileout = new DataOutputStream[ARFFSymbolFilter.ns]; final Instances[] instances = new Instances[ARFFSymbolFilter.ns]; System.out.println("Writing file"); for (int i = 0; i < ARFFSymbolFilter.ns; ++i) { final int k = ARFFSymbolFilter.strokenumber ? i : (nsold - 1); fileout[ARFFSymbolFilter.strokenumber ? i : 0] = new DataOutputStream(new FileOutputStream( filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff#")); } final int tot = symbolData.size(); for (int j = 0; j < symbolData.size(); ++j) { final ArrayList<?> group = (ArrayList<?>) symbolData.get(j); for (int i = 0; i < group.size(); ++i) { final Symbol sym = (Symbol) group.get(i); final int k = ARFFSymbolFilter.strokenumber ? (sym.size() - 1) : 0; if (sym.name.equals("no_name") || sym.name.equals("empty_symbol")) { System.out.print("#" + sym.name + "#"); } else { for (int t = 0; t < ARFFSymbolFilter.times; ++t) { final String line = constructStringInstance(sym, ARFFSymbolFilter.alpha); if (line == null) { System.out.print("line=null!"); } else { if (instances[k] == null) { final StringTokenizer st = new StringTokenizer(line, " "); final int nt = st.countTokens() / 2; final FastVector att = new FastVector(); for (int kk = 0; kk < nt; ++kk) { final String token = st.nextToken(); att.addElement(new Attribute(new String(token))); st.nextToken(); } att.addElement(new Attribute("class", (FastVector) null)); (instances[k] = new Instances("Symbols of Size " + (k + 1), att, 1)) .setClassIndex(att.size() - 1); } final StringTokenizer st = new StringTokenizer(line, " "); final int nt = st.countTokens() / 2; final Instance inst = new Instance(nt + 1); for (int kk = 0; kk < nt; ++kk) { st.nextToken(); final String token = new String(st.nextToken()); inst.setValue(kk, Double.parseDouble(token)); } inst.setDataset(instances[k]); inst.setClassValue(oldReplace(sym.name, "\\", "")); instances[k].add(inst); } } } } if ((int) (100.0 * j) / tot % 10 == 0) { System.out.print((int) (100.0 * j) / tot + "%-"); } } for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { if (fileout[ARFFSymbolFilter.strokenumber ? k : 0] == null) { System.out.println("fo" + fileout[ARFFSymbolFilter.strokenumber ? k : 0]); } if (instances[ARFFSymbolFilter.strokenumber ? k : 0] == null) { System.out.println("in:" + instances[ARFFSymbolFilter.strokenumber ? k : 0]); } fileout[ARFFSymbolFilter.strokenumber ? k : 0] .writeBytes(instances[ARFFSymbolFilter.strokenumber ? k : 0].toString()); fileout[ARFFSymbolFilter.strokenumber ? k : 0].close(); } final StringToNominal filter = new StringToNominal(); final String[] args = new String[4]; for (int k = 0; k < ARFFSymbolFilter.ns; ++k) { args[0] = "-i"; args[1] = filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff#"; args[2] = "-o"; args[3] = filenameout + (ARFFSymbolFilter.strokenumber ? ("" + (k + 1)) : "") + ".arff"; Filter.filterFile(filter, args); new File(args[1]).delete(); } System.out.println("100.0%"); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (Exception ioe) { ioe.printStackTrace(); } } }