Révision 2787

tmp/org.txm.core/src/java/org/txm/scripts/importer/SAttributesListener.groovy (revision 2787)
1
package org.txm.scripts.importer
2

  
3
import java.util.ArrayList;
4
import java.util.HashMap;
5

  
6
import javax.xml.stream.*;
7

  
8
import org.txm.utils.io.IOUtils
9

  
10
/**
 * Scans XML files with StAX and records, for every element met inside a
 * {@code <text>} element, the names of its attributes and how deeply the
 * element is nested inside itself.
 * Element and attribute names are stored lower-cased. The word element
 * ({@link #W}), {@code ana} and {@code form} elements are ignored.
 *
 * @author mdecorde
 */
class SAttributesListener {

	// Results are per-instance: they used to be static, which made every new
	// listener erase the results collected by the previous ones.
	// NOTE: the values stored in 'structs' are created as HashSet in startElement();
	// the declared ArrayList type is kept only to leave the signatures untouched.
	private HashMap<String,ArrayList<String>> structs = new HashMap<String, ArrayList<String>>();
	/** current self-nesting depth of each element while parsing */
	private HashMap<String, Integer> structsCountProf = new HashMap<String, Integer>();
	/** maximum self-nesting depth reached by each element */
	private HashMap<String, Integer> structsMaxProf = new HashMap<String, Integer>();
	/** path of the currently opened elements, always starts and ends with "/" */
	private String structPath = "/";
	private XMLStreamReader parser;

	/** Creates a listener with empty results and no parser. */
	SAttributesListener() {
	}

	/**
	 * Creates a listener with empty results.
	 * @param parser the StAX reader the attributes are read from
	 */
	SAttributesListener(XMLStreamReader parser) {
		this();
		this.parser = parser;
	}

	/**
	 * Makes this listener use the result maps of another listener
	 * (the maps are shared, not copied).
	 * @param another the listener whose result maps are taken
	 */
	public void appendResultsTo(SAttributesListener another) {
		// '.@' reads the fields directly, without going through getStructs()
		structs = another.@structs;
		structsCountProf = another.@structsCountProf;
		structsMaxProf = another.@structsMaxProf;
	}

	/**
	 * Sets the StAX reader used by {@link #startElement(String)}.
	 * @param parser the StAX reader
	 */
	public void start(def parser) {
		this.parser = parser;
	}

	def W = "w";
	def ANA = "ana";
	def FORM = "form";

	/**
	 * Call this method for each START_ELEMENT stax event.
	 * Registers the element, updates its nesting depth and stores the
	 * attribute names of the parser's current element.
	 * @param localname the element localname
	 */
	public void startElement(String localname) {
		localname = localname.toLowerCase();

		if (localname.equals(W) || localname.equals(ANA) || localname.equals(FORM)) return;

		structPath += localname + "/"

		def attrs = structs.get(localname)
		if (!structs.containsKey(localname)) { // first occurrence of this element
			attrs = new HashSet();
			structs.put(localname, attrs);
			structsCountProf.put(localname, 0)
			structsMaxProf.put(localname, 0)
		}

		// get structure recursion
		int prof = structsCountProf.get(localname) + 1
		structsCountProf.put(localname, prof)
		if (structsMaxProf.get(localname) < prof) {
			structsMaxProf.put(localname, prof)
		}

		// get the structure attributes
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			attrs << parser.getAttributeLocalName(i).toLowerCase();
		}
	}

	/**
	 * Call this method for each END_ELEMENT stax event.
	 * Pops the element from the current path and decrements its nesting depth.
	 * @param localname the element localname
	 */
	public void endElement(String localname) {
		localname = localname.toLowerCase();

		if (localname.equals(W) || localname.equals(ANA) || localname.equals(FORM)) return;

		if (structPath.length() > 1) {
			// structPath ends with "/": search the slash that precedes the last
			// segment so that the whole "name/" segment is removed
			int idx = structPath.lastIndexOf("/", structPath.length() - 2);
			structPath = structPath.substring(0, idx + 1)

			def depth = structsCountProf.get(localname)
			if (depth != null) { // element never started: nothing to decrement
				structsCountProf.put(localname, depth - 1)
			}
		}
	}

	boolean firstGetStructs = true;

	/**
	 * Returns the element names mapped to their attribute names.
	 * Removes the div1..div6 entries according to the current depth of "div"
	 * and, on the first call, re-keys the map with lower-cased names.
	 * @return the internal map (not a copy)
	 */
	public HashMap<String,ArrayList<String>> getStructs() {
		// NOTE(review): this reads the *current* depth (structsCountProf), which is back
		// to 0 once a well-formed document is fully parsed - structsMaxProf may have been intended.
		def divDepth = structsCountProf.get("div")
		if (divDepth != null) {
			for (int level = 1 ; level <= 6 ; level++) {
				if (divDepth >= level) structs.remove("div" + level)
			}
		}

		if (firstGetStructs) {
			firstGetStructs = false;
			def keys = []
			keys.addAll(structs.keySet());
			for (String key : keys) {
				def value = structs.get(key);
				structs.remove(key)
				structs.put(key.toLowerCase(), value);
			}
		}

		return structs;
	}

	boolean firstGetstructsCountProf = true;

	/**
	 * Returns, for each element, its maximum nesting depth minus one
	 * (0 when the element was never nested or never seen open).
	 * @return a new map, independent from the internal one
	 */
	public HashMap<String, Integer> getProfs() {
		HashMap<String, Integer> clone = new HashMap<String, Integer>();
		for (String key : structsMaxProf.keySet()) {
			int max = structsMaxProf.get(key)
			clone.put(key, max > 0 ? max - 1 : 0)
		}
		return clone;
	}

	/**
	 * Scans one file with a new listener.
	 * @param xmlFile the XML file to scan
	 * @return the listener holding the results
	 */
	public static SAttributesListener scanFile(File xmlFile) {
		return scanFile(xmlFile, null)
	}

	/**
	 * Sets the StAX reader used by {@link #startElement(String)}.
	 * @param parser the StAX reader
	 */
	public void setParser(def parser) {
		this.parser = parser;
	}

	/**
	 * Scans one file; only the content of {@code <text>} elements is analysed.
	 *
	 * @param xmlFile the XML file to scan
	 * @param parentListener results are appended to the parentListener if any
	 * @return parentListener if not null, else a new listener
	 */
	public static SAttributesListener scanFile(File xmlFile, SAttributesListener parentListener) {

		String TEXT = "text";
		def start = false;
		def inputData = xmlFile.toURI().toURL().openStream();
		def parser = null;
		SAttributesListener listener;
		try {
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

			if (parentListener != null) {
				listener = parentListener;
				listener.setParser(parser)
			} else {
				listener = new SAttributesListener(parser);
			}

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) { // start elem
					if (TEXT.equals(parser.getLocalName())) start = true;
					if (start) listener.startElement(parser.getLocalName())
				} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
					if (start) listener.endElement(parser.getLocalName())
					if (TEXT.equals(parser.getLocalName())) start = false;
				}
			}
		} finally {
			// release the resources even when the parsing fails;
			// closing the reader does not close the underlying stream
			try {
				if (parser != null) parser.close();
			} finally {
				inputData.close();
			}
		}

		return listener
	}

	/**
	 * Scans the non-hidden "*.xml" files of a directory with one shared listener.
	 * @param xmlDirectory the directory containing the XML files
	 * @param wordTag the name of the word element, ignored during the scan
	 * @return the listener holding the results of all the files
	 */
	public static SAttributesListener scanFiles(File xmlDirectory, String wordTag) {
		SAttributesListener listener = new SAttributesListener()
		listener.W = wordTag
		for (File xmlFile : xmlDirectory.listFiles(IOUtils.HIDDENFILE_FILTER)) {
			if (xmlFile.isFile() && !xmlFile.isHidden() && xmlFile.getName().toLowerCase().endsWith(".xml")) {
				scanFile(xmlFile, listener); // results saved in 'listener' data
			}
		}

		return listener;
	}
}
tmp/org.txm.core/src/java/org/txm/core/results/TXMResult.java (revision 2787)
1947 1947
	 * Gets a message indicating the start of computing.
1948 1948
	 * Dedicated to indicate the start of computing and, for example, the parameters used for the computing.
1949 1949
	 * 
1950
	 * Warning: at this point we don't know if all parameters are set -> you should avoid NPE
1951
	 * 
1950 1952
	 * @return a message indicating the start of computing
1951 1953
	 */
1952
	// FIXME: SJ: this method must become abstract in TXM0.8.1 and implemented by subclasses
1954
	// FIXME: SJ: this method must become abstract in TXM 0.8.1 and implemented by subclasses
1953 1955
	public String getComputingStartMessage() {
1954 1956
		return NLS.bind("Computing {0}...", this.getClass().getSimpleName());
1955 1957
	}
tmp/org.txm.core/src/java/org/txm/importer/SAttributesListener.java (revision 2787)
1
package org.txm.importer;
2

  
3
import java.io.File;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.net.MalformedURLException;
7
import java.util.ArrayList;
8
import java.util.HashMap;
9
import java.util.HashSet;
10

  
11
import javax.xml.stream.*;
12

  
13
import org.txm.utils.io.IOUtils;
14

  
15
/**
16
 * Read an XML file and find out : XML elements, their attributes and recursivity level
17
 * names are lowercases
18
 * 
19
 * @author mdecorde
20
 *
21
 */
22
public class SAttributesListener {
23

  
24
	public HashMap<String,HashSet<String>> structs = new HashMap<String, HashSet<String>>();
25
	public HashSet<String> anatypes = new HashSet<String>();
26
	public HashMap<String, Integer> structsCountProf = new HashMap<String, Integer>();
27
	public HashMap<String, Integer> structsMaxProf = new HashMap<String, Integer>();
28
	private String structPath = "/";
29
	private XMLStreamReader parser;
30

  
31
	SAttributesListener() {
32
		structs = new HashMap<String, HashSet<String>>();
33
		structsCountProf = new HashMap<String, Integer>();
34
		structsMaxProf = new HashMap<String, Integer>();
35
		structPath = "/";
36
		anatypes = new HashSet<String>(); // store scanned word attributes
37
	}
38

  
39
	SAttributesListener(XMLStreamReader parser) {
40
		this();
41
		this.parser = parser;
42
	}
43

  
44
	public void appendResultsTo(SAttributesListener another) {
45
		structs = another.structs;
46
		structsCountProf = another.structsCountProf;
47
		structsMaxProf = another.structsMaxProf;
48
		anatypes = another.anatypes; // store scanned word attributes
49
	}
50

  
51
	public void start(XMLStreamReader parser) {
52
		this.parser = parser;
53
	}
54

  
55
	String W = "w";
56
	String ANA = "ana";
57
	String FORM = "form";
58
	/**
59
	 * Call this method for each START_ELEMENT stax event
60
	 * @param localname the element localname
61
	 */
62
	public void startElement(String localname) {
63
		localname = localname.toLowerCase();
64

  
65
		//String localname = parser.getLocalName();
66
		if(localname.equals(W)) return;
67
		if(localname.equals(ANA)) return;
68
		if(localname.equals(FORM)) return;
69

  
70
		structPath += localname+"/";
71
		//println "add: "+structPath
72
		HashSet<String> attrs = structs.get(localname);
73
		if (!structs.containsKey(localname)) {
74
			attrs = new HashSet<String>();
75
			structs.put(localname, attrs);
76
			structsCountProf.put(localname, 0);
77
			structsMaxProf.put(localname, 0);
78
		} //else {
79

  
80
		// get structure recursion
81
		int prof = structsCountProf.get(localname)+1;
82
		structsCountProf.put(localname, prof);
83
		if (structsMaxProf.get(localname) < prof) {
84
			structsMaxProf.put(localname, prof);
85
		}
86

  
87
		// get the structure attributes
88
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
89
			attrs.add(parser.getAttributeLocalName(i).toLowerCase());
90
		}
91
	}
92

  
93
	/**
94
	 * Call this method for each END_ELEMENT stax event
95
	 * @param localname the element localname
96
	 */
97
	public void endElement(String localname) {
98
		localname = localname.toLowerCase();
99
		//String localname = parser.getLocalName();
100
		if(localname.equals(W)) return;
101
		if(localname.equals(ANA)) return;
102
		if(localname.equals(FORM)) return;
103

  
104
		if (structPath.length() > 1) {
105
			int idx = structPath.lastIndexOf("/");
106
			if (idx > 0) {
107
				structPath = structPath.substring(0, idx);
108
				//println "end of $localname "+(structsCountProf.get(localname))
109
				//if (structsCountProf.get(localname) != null)
110
				structsCountProf.put(localname, structsCountProf.get(localname)-1);
111
			}
112
			//println "pop: "+structPath
113
		}
114
	}
115

  
116
//	boolean firstGetStructs = true;
117
	public HashMap<String,HashSet<String>> getStructs() {
118
		if (structsCountProf.containsKey("div")) {
119
			if (structsCountProf.get("div") > 0)
120
				structs.remove("div1");
121
			if (structsCountProf.get("div") > 1)
122
				structs.remove("div2");
123
			if (structsCountProf.get("div") > 2)
124
				structs.remove("div3");
125
			if (structsCountProf.get("div") > 3)
126
				structs.remove("div4");
127
			if (structsCountProf.get("div") > 4)
128
				structs.remove("div5");
129
			if (structsCountProf.get("div") > 5)
130
				structs.remove("div6");
131
		}
132
//		if (firstGetStructs) {
133
//			firstGetStructs = false;
134
		// fix min&maj names for CQP
135
			ArrayList<String> keys = new ArrayList<String>();
136
			keys.addAll(structs.keySet());
137
			for (String key : keys) {
138
				HashSet<String> value = structs.get(key);
139
				structs.remove(key);
140
				structs.put(key.toLowerCase(), value);
141
			}
142
//		}
143

  
144
		return structs;
145
	}
146

  
147
	boolean firstGetstructsCountProf = true;
148
	public HashMap<String, Integer> getProfs() {
149

  
150
		//		if (firstGetstructsCountProf) {
151
		//			firstGetstructsCountProf = false;
152
		//			def keys = []
153
		//			keys.addAll(structsCountProf.keySet());
154
		//			for( String key : keys) {
155
		//				def value = structsCountProf.get(key);
156
		//				structsCountProf.remove(key)
157
		//				structsCountProf.put(key.toLowerCase(), value);
158
		//			}
159
		//		}
160
		HashMap<String, Integer> clone = new HashMap<String, Integer>();
161
		for (String key : structsMaxProf.keySet()) {
162
			if (structsMaxProf.get(key) > 0)
163
				clone.put(key, structsMaxProf.get(key)-1);
164
			else
165
				clone.put(key, 0);
166
		}
167
		return clone;
168
	}
169

  
170
	public void initialize(ArrayList<String> pattributes, HashMap<String, HashSet<String>> sAttributesMap, HashMap<String, Integer> sAttributesProfs) {
171
		this.anatypes.addAll(pattributes);
172
		for (String s : sAttributesMap.keySet()) {
173
			this.structsMaxProf.put(s, sAttributesProfs.get(s));
174
			this.structsCountProf.put(s, 0);
175
			this.structs.put(s, sAttributesMap.get(s));
176
		}
177
	}
178
	
179
	public HashSet<String> getAnatypes() {
180
		return anatypes;
181
	}
182

  
183
//	public SAttributesListener scanFile(File xmlFile) throws MalformedURLException, IOException, XMLStreamException {
184
//		return scanFile(xmlFile, this);
185
//	}
186

  
187
	public void setParser(XMLStreamReader parser) {
188
		this.parser = parser;
189
	}
190

  
191
	/**
192
	 * Merge results in the parentListener
193
	 * 
194
	 * @param xmlFile
195
	 * @param parentListener results are appended to the parentListener if any
196
	 * @return
197
	 * @throws IOException 
198
	 * @throws MalformedURLException 
199
	 * @throws XMLStreamException 
200
	 */
201
	public SAttributesListener scanFile(File xmlFile) throws MalformedURLException, IOException, XMLStreamException {
202

  
203
		boolean startText = false;
204
		boolean startWord = false;
205
		InputStream inputData = xmlFile.toURI().toURL().openStream();
206
		XMLInputFactory factory = XMLInputFactory.newInstance();
207
		XMLStreamReader parser = factory.createXMLStreamReader(inputData);
208

  
209
//		SAttributesListener listener;
210
//		if (parentListener != null) {
211
//			listener = parentListener;
212
//			listener.setParser(parser);
213
//		} else {
214
//			listener = new SAttributesListener(parser);
215
//		}
216
		String TEXT = "text";
217
		String ANA = "ana";
218
		String TYPE = "type";
219
		//HashSet<String> types = new HashSet<String>();
220
		this.setParser(parser);
221
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
222
			if (event == XMLStreamConstants.START_ELEMENT) { // start elem
223
				if (TEXT.equals(parser.getLocalName())) startText = true;
224

  
225
				if (startText) this.startElement(parser.getLocalName());
226

  
227
				if (this.W.equals(parser.getLocalName())) {
228
					startWord = true;
229
				} else if (startWord && ANA.equals(parser.getLocalName())) { // ana elem
230
					for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
231
						if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
232
							this.anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
233
							break;
234
						}
235
					}
236
				}
237
			} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
238
				if (startText) this.endElement(parser.getLocalName());
239
				if (TEXT.equals(parser.getLocalName())) startText = false;
240

  
241
				if (this.W.equals(parser.getLocalName())) {
242
					startWord = false;
243
				}
244
			}
245
		}
246
		if (parser != null) parser.close();
247
		if (inputData != null) inputData.close();
248

  
249
		return this;
250
	}
251

  
252
	/**
253
	 * scan the XML files of a directory to list the structures with their properties and levels. Also list the word properties
254
	 * @param xmlDirectory
255
	 * @param wordTag
256
	 * @return
257
	 * @throws XMLStreamException 
258
	 * @throws IOException 
259
	 * @throws MalformedURLException 
260
	 */
261
	public static SAttributesListener scanFiles(File xmlDirectory, String wordTag) throws MalformedURLException, IOException, XMLStreamException {
262
		SAttributesListener listener = new SAttributesListener();
263
		listener.W = wordTag;
264
		for (File xmlFile : xmlDirectory.listFiles(IOUtils.HIDDENFILE_FILTER)) {
265
			if (xmlFile.isFile() && !xmlFile.isHidden() && xmlFile.getName().toLowerCase().endsWith(".xml")) {
266
				listener.scanFile(xmlFile); // results saved in 'listener' data
267
				//				println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
268
				//				println " prof: "+listener.getStructs()
269
				//				println " prof: "+listener.getProfs()
270
				//				println " path: "+listener.structPath
271
			}
272
		}
273

  
274
		return listener;
275
	}
276
}
0 277

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/Compiler.java (revision 2787)
28 28
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
29 29
		cqpDirectory = new File(module.getBinaryDirectory(), "cqp");		
30 30
		outputDirectory = new File(module.getBinaryDirectory(), "data");
31
		registryDirectory = new File(module.getBinaryDirectory(), "registry");
32 31
		dataDirectory = new File(outputDirectory, module.getCorpusName());
32
		registryDirectory = new File(module.getBinaryDirectory(), "registry");
33 33

  
34 34
		DeleteDir.deleteDirectory(outputDirectory);
35 35
		outputDirectory.mkdirs();
36
		
37
		DeleteDir.deleteDirectory(dataDirectory);
38 36
		dataDirectory.mkdirs();
39 37
		
40 38
		DeleteDir.deleteDirectory(registryDirectory);
......
42 40
		
43 41
		if (!module.isUpdatingCorpus()) {
44 42
			DeleteDir.deleteDirectory(cqpDirectory);
45
			cqpDirectory.mkdir();
46
		}
43
		} 
44
		cqpDirectory.mkdir();
47 45
	}
48 46

  
49 47
	@Override

Formats disponibles : Unified diff