Révision 3015

tmp/org.txm.analec.rcp/src/org/txm/annotation/urs/imports/URSAnnotationsImporter.java (revision 3015)
34 34
import visuAnalec.vue.Vue;
35 35

  
36 36
public class URSAnnotationsImporter {
37

  
37
	
38 38
	protected MainCorpus mainCorpus;
39
	
39 40
	protected Corpus analecCorpus;
41
	
40 42
	protected File annotationsDirectory;
43
	
41 44
	protected Vue analecVue;
45
	
42 46
	protected IProgressMonitor monitor;
47
	
43 48
	protected XMLInputFactory factory;
49
	
44 50
	protected File aamFile;
51
	
45 52
	protected static PersonalNamespaceContext Nscontext = new PersonalNamespaceContext();
46

  
53
	
47 54
	public URSAnnotationsImporter(IProgressMonitor monitor, File outputDirectory, File aamFile, MainCorpus mainCorpus, Corpus analecCorpus, Vue analecVue) {
48 55
		this.annotationsDirectory = outputDirectory;
49 56
		this.aamFile = aamFile;
......
52 59
		this.analecVue = analecVue;
53 60
		this.monitor = monitor;
54 61
	}
55

  
62
	
56 63
	public boolean process() throws Exception {
57 64
		if (!annotationsDirectory.exists()) {
58
			System.out.println("Error: annotationsDirectory does not exist: "+annotationsDirectory.getAbsolutePath());
65
			System.out.println("Error: annotationsDirectory does not exist: " + annotationsDirectory.getAbsolutePath());
59 66
			return false;
60 67
		}
61 68
		if (!annotationsDirectory.isDirectory()) {
62
			System.out.println("Error: annotationsDirectory is not a directory: "+annotationsDirectory.getAbsolutePath());
69
			System.out.println("Error: annotationsDirectory is not a directory: " + annotationsDirectory.getAbsolutePath());
63 70
			return false;
64 71
		}
65 72
		if (!aamFile.exists()) {
66
			System.out.println("Warning: aamFile does not exist: "+aamFile.getAbsolutePath());
67
		} else {
68

  
73
			System.out.println("Warning: aamFile does not exist: " + aamFile.getAbsolutePath());
74
		}
75
		else {
76
			
69 77
			if (!FichiersGlozz.importerModeleGlozz(analecCorpus, aamFile)) {
70
				System.out.println("Error while importing Glozz model: "+aamFile);
78
				System.out.println("Error while importing Glozz model: " + aamFile);
71 79
				return false;
72 80
			}
73 81
		}
74 82
		File[] ursFiles = annotationsDirectory.listFiles(new FilenameFilter() {
83
			
75 84
			@Override
76 85
			public boolean accept(File dir, String name) {
77 86
				return name.matches(".+-urs\\.xml");
78 87
			}
79 88
		});
80

  
89
		
81 90
		if (ursFiles == null) {
82
			System.out.println("No XML files found in "+annotationsDirectory);
91
			System.out.println("No XML files found in " + annotationsDirectory);
83 92
			return false;
84 93
		}
85 94
		if (ursFiles.length == 0) {
86
			System.out.println("No XML files found in "+annotationsDirectory);
95
			System.out.println("No XML files found in " + annotationsDirectory);
87 96
			return false;
88 97
		}
89 98
		Arrays.sort(ursFiles);
90

  
99
		
91 100
		factory = XMLInputFactory.newInstance();
92 101
		List<String> cqpTextIds = Arrays.asList(mainCorpus.getCorpusTextIdsList());
93

  
94
		List<Integer> all_result_summary = Arrays.asList(0,0,0,0,0,0,0,0);
95

  
96
		if (monitor != null) monitor.subTask("Processing "+ursFiles.length+" texts...");
102
		
103
		List<Integer> all_result_summary = Arrays.asList(0, 0, 0, 0, 0, 0, 0, 0);
104
		
105
		if (monitor != null) monitor.subTask("Processing " + ursFiles.length + " texts...");
97 106
		for (File xmlTXMFile : ursFiles) {
98 107
			String textid = xmlTXMFile.getName().substring(0, xmlTXMFile.getName().length() - 8);
99
			System.out.println("Processing text: "+textid);
108
			System.out.println("Processing text: " + textid);
100 109
			if (cqpTextIds.contains(textid)) {
101
				if (monitor != null) monitor.subTask("Processing "+textid+" text...");
110
				if (monitor != null) monitor.subTask("Processing " + textid + " text...");
102 111
				// N unit, N unit error, N unit no match error, N unit too much match error, N Relation, N Relation error, N Schema, N Schema error
103 112
				List<Integer> result_summary = processText(textid, xmlTXMFile);
104

  
105
				System.out.println(textid+" import report: ");
106
				System.out.println(" N Units added: "+result_summary.get(0));
107
				System.out.println(" N Units error: "+result_summary.get(1));
108
				System.out.println(" N Units no match error: "+result_summary.get(2));
109
				System.out.println(" N Units too much match error: "+result_summary.get(3));
110
				System.out.println(" N Relations added: "+result_summary.get(4));
111
				System.out.println(" N Relations error: "+result_summary.get(5));
112
				System.out.println(" N Schemas added: "+result_summary.get(6));
113
				System.out.println(" N Schemas error: "+result_summary.get(7));
114

  
115
				for (int i = 0 ; i < all_result_summary.size() ; i++) all_result_summary.set(i, all_result_summary.get(i)+result_summary.get(i));
116

  
117
			} else {
118
				System.out.println("Warning: cannot found text with ID="+textid+" in current CQP corpus.");
113
				
114
				System.out.println(textid + " import report: ");
115
				System.out.println(" N Units added: " + result_summary.get(0));
116
				System.out.println(" N Units error: " + result_summary.get(1));
117
				System.out.println(" N Units no match error: " + result_summary.get(2));
118
				System.out.println(" N Units too much match error: " + result_summary.get(3));
119
				System.out.println(" N Relations added: " + result_summary.get(4));
120
				System.out.println(" N Relations error: " + result_summary.get(5));
121
				System.out.println(" N Schemas added: " + result_summary.get(6));
122
				System.out.println(" N Schemas error: " + result_summary.get(7));
123
				
124
				for (int i = 0; i < all_result_summary.size(); i++)
125
					all_result_summary.set(i, all_result_summary.get(i) + result_summary.get(i));
126
				
119 127
			}
120

  
128
			else {
129
				System.out.println("Warning: cannot found text with ID=" + textid + " in current CQP corpus.");
130
			}
131
			
121 132
			if (monitor != null && monitor.isCanceled()) {
122 133
				return false;
123 134
			}
124 135
		}
125

  
136
		
126 137
		if (!aamFile.exists()) {
127 138
			Vue vue = URSCorpora.getVue(analecCorpus);
128 139
			vue.retablirVueParDefaut();
129 140
		}
130

  
141
		
131 142
		System.out.println("Final import report: ");
132
		System.out.println(" N Units added: "+all_result_summary.get(0));
133
		System.out.println(" N Units error: "+all_result_summary.get(1));
134
		System.out.println(" N Units no match error: "+all_result_summary.get(2));
135
		System.out.println(" N Units too much match error: "+all_result_summary.get(3));
136
		System.out.println(" N Relations added: "+all_result_summary.get(4));
137
		System.out.println(" N Relations error: "+all_result_summary.get(5));
138
		System.out.println(" N Schemas added: "+all_result_summary.get(6));
139
		System.out.println(" N Schemas error: "+all_result_summary.get(7));
140

  
143
		System.out.println(" N Units added: " + all_result_summary.get(0));
144
		System.out.println(" N Units error: " + all_result_summary.get(1));
145
		System.out.println(" N Units no match error: " + all_result_summary.get(2));
146
		System.out.println(" N Units too much match error: " + all_result_summary.get(3));
147
		System.out.println(" N Relations added: " + all_result_summary.get(4));
148
		System.out.println(" N Relations error: " + all_result_summary.get(5));
149
		System.out.println(" N Schemas added: " + all_result_summary.get(6));
150
		System.out.println(" N Schemas error: " + all_result_summary.get(7));
151
		
141 152
		return true;
142 153
	}
143

  
154
	
144 155
	private int getPosition(Subcorpus textSubcorpus, HashMap<String, int[]> id2position, String id) {
145 156
		int[] positions = id2position.get(id);
146 157
		int start = textSubcorpus.getMatches().get(0).getStart();
147 158
		int end = textSubcorpus.getMatches().get(0).getEnd();
148 159
		if (positions.length == 0) { // no word for id=deb
149 160
			return -1;
150
		} 
151

  
161
		}
162
		
152 163
		for (int p : positions) {
153 164
			if (start <= p && p <= end) {
154 165
				return p;
155 166
			}
156 167
		}
157

  
168
		
158 169
		return -2;
159 170
	}
160

  
171
	
161 172
	private List<Integer> processText(String textid, File xmlTXMFile) throws Exception {
162 173
		AbstractCqiClient CQI = CQPSearchEngine.getCqiClient();
163 174
		// N unit, N unit error, N unit no match error, N unit too much match error, N Relation, N Relation error, N Schema, N Schema error
......
169 180
		int nRelationsError = 0;
170 181
		int nSchemaAdded = 0;
171 182
		int nSchemaError = 0;
172

  
183
		
173 184
		unites.clear();
174 185
		relations.clear();
175 186
		schemas.clear();
176 187
		elementProperties.clear();
177

  
188
		
178 189
		if (!parseXMLTXMFile(xmlTXMFile)) return null; // fill unites, relations, schemas and elementProperties
179

  
180
		//		if (unites.size() > 0) System.out.println(unites);
181
		//		if (relations.size() > 0) System.out.println(relations);
182
		//		if (schemas.size() > 0) System.out.println(schemas);
183
		//		if (elementProperties.size() > 0) System.out.println(elementProperties);
184
		//		for (String u : unites.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
185
		//		for (String u : relations.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
186
		//		for (String u : schemas.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
187

  
188

  
189
		CQLQuery textQuery = new CQLQuery("[_.text_id=\""+textid+"\"] expand to text");
190
		
191
		// if (unites.size() > 0) System.out.println(unites);
192
		// if (relations.size() > 0) System.out.println(relations);
193
		// if (schemas.size() > 0) System.out.println(schemas);
194
		// if (elementProperties.size() > 0) System.out.println(elementProperties);
195
		// for (String u : unites.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
196
		// for (String u : relations.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
197
		// for (String u : schemas.keySet()) if (!elementProperties.containsKey(u+"-fs")) System.out.println("MISSIGN ELEM PROPERTIES: "+u);
198
		
199
		CQLQuery textQuery = new CQLQuery("[_.text_id=\"" + textid + "\"] expand to text");
190 200
		Subcorpus textSubCorpus = mainCorpus.createSubcorpus(textQuery, "TXTTMP");
191 201
		textSubCorpus.compute();
192

  
193
		HashSet<String> ids = new HashSet<String>();
202
		
203
		HashSet<String> ids = new HashSet<>();
194 204
		for (String id : unites.keySet()) {
195 205
			ids.add(unites.get(id)[2]);
196 206
			ids.add(unites.get(id)[3]);
197 207
		}
198 208
		String[] idsArray = ids.toArray(new String[ids.size()]);
199 209
		int[] idsIds = CQI.str2Id(mainCorpus.getProperty("id").getQualifiedName(), idsArray);
200

  
201
		HashMap<String, int[]> id2position = new HashMap<String, int[]>();
202
		for (int i = 0 ; i < idsArray.length ; i++) {
210
		
211
		HashMap<String, int[]> id2position = new HashMap<>();
212
		for (int i = 0; i < idsArray.length; i++) {
203 213
			int[] positions = CQI.id2Cpos(mainCorpus.getProperty("id").getQualifiedName(), idsIds[i]);
204 214
			id2position.put(idsArray[i], positions);
205 215
		}
206

  
216
		
207 217
		Structure structure = analecCorpus.getStructure();
208 218
		HashSet<String> unitesStructure = structure.getUnites();
209 219
		HashSet<String> relationsStructure = structure.getRelations();
210 220
		HashSet<String> schemasStructure = structure.getSchemas();
211

  
221
		
212 222
		ConsoleProgressBar cpb = new ConsoleProgressBar(unites.size());
213 223
		for (String id : unites.keySet()) {
214 224
			cpb.tick();
......
220 230
			String deb = unites.get(id)[2];
221 231
			String fin = unites.get(id)[3];
222 232
			if (elementProperties.containsKey(ana)) {
223

  
233
				
224 234
				int start = getPosition(textSubCorpus, id2position, deb);
225 235
				int end = getPosition(textSubCorpus, id2position, fin);
226

  
236
				
227 237
				if (start < 0) {
228 238
					nUnitsError++;
229 239
					if (start == -1) {
230
						System.out.println("WARNING: no position found word id = "+deb);
240
						System.out.println("WARNING: no position found word id = " + deb);
231 241
						nUnitsNoMatchError++;
232
					} else {
233
						System.out.println("WARNING: too many positions found for word id = "+deb);
242
					}
243
					else {
244
						System.out.println("WARNING: too many positions found for word id = " + deb);
234 245
						nUnitsTooMuchMatchError++;
235 246
					}
236
				} else if (end < 0) {
247
				}
248
				else if (end < 0) {
237 249
					nUnitsError++;
238 250
					if (end == -1) {
239
						System.out.println("WARNING: no position found word id = "+fin);
251
						System.out.println("WARNING: no position found word id = " + fin);
240 252
						nUnitsNoMatchError++;
241
					} else {
242
						System.out.println("WARNING: too many positions found for word id = "+fin);
253
					}
254
					else {
255
						System.out.println("WARNING: too many positions found for word id = " + fin);
243 256
						nUnitsTooMuchMatchError++;
244 257
					}
245
				} else { // OK
246

  
247
					//System.out.println("create unite: "+type+" ["+deb+", "+fin+"]");
258
				}
259
				else { // OK
260
					
261
					// System.out.println("create unite: "+type+" ["+deb+", "+fin+"]");
248 262
					Unite unite = analecCorpus.addUniteSaisie(type, start, end);
249 263
					HashMap<String, String> props = elementProperties.get(ana);
250 264
					HashSet<String> nomsProps = structure.getNomsProps(Unite.class, type);
......
257 271
					unitesRef.put(id, unite);
258 272
					nUnitsAdded++;
259 273
				}
260
			} else {
261
				System.out.println("Warning no properties found for element id="+id+" and ana="+ana);
262 274
			}
275
			else {
276
				System.out.println("Warning no properties found for element id=" + id + " and ana=" + ana);
277
			}
263 278
		}
264 279
		cpb.done();
265

  
280
		
266 281
		cpb = new ConsoleProgressBar(relations.size());
267 282
		for (String id : relations.keySet()) {
268 283
			cpb.tick();
......
272 287
				structure.ajouterType(Relation.class, type);
273 288
			}
274 289
			String target = relations.get(id)[2];
275

  
290
			
276 291
			if (elementProperties.containsKey(ana)) {
277

  
292
				
278 293
				String[] wordsref = target.split(" ");
279 294
				String[] wordsid = new String[wordsref.length];
280
				for (int i = 0 ; i < wordsref.length ; i++) wordsid[i] = wordsref[i].substring(1);
295
				for (int i = 0; i < wordsref.length; i++)
296
					wordsid[i] = wordsref[i].substring(1);
281 297
				try {
282 298
					Unite elt1 = unitesRef.get(wordsid[0]);
283 299
					Unite elt2 = unitesRef.get(wordsid[1]);
284 300
					if (elt1 != null && elt2 != null) {
285
						//System.out.println("create relation: "+type+" ["+deb+", "+fin+"]");
286

  
301
						// System.out.println("create relation: "+type+" ["+deb+", "+fin+"]");
302
						
287 303
						Relation relation = analecCorpus.addRelationSaisie(type, elt1, elt2);
288 304
						HashMap<String, String> props = elementProperties.get(ana);
289 305
						HashSet<String> nomsProps = structure.getNomsProps(Relation.class, type);
......
294 310
							relation.putProp(prop, props.get(prop));
295 311
						}
296 312
						nRelationsAdded++;
297
					} else {
298
						System.out.println("ERROR: relation element not found "+Arrays.toString(wordsref));
313
					}
314
					else {
315
						System.out.println("ERROR: relation element not found " + Arrays.toString(wordsref));
299 316
						nRelationsError++;
300 317
					}
301
				} catch(Exception e) {
302

  
303 318
				}
304
			} else {
305
				System.out.println("Warning no properties found for element id="+id);
319
				catch (Exception e) {
320
					
321
				}
306 322
			}
323
			else {
324
				System.out.println("Warning no properties found for element id=" + id);
325
			}
307 326
		}
308 327
		cpb.done();
309

  
328
		
310 329
		cpb = new ConsoleProgressBar(schemas.size());
311 330
		for (String id : schemas.keySet()) {
312 331
			cpb.tick();
......
319 338
			if (elementProperties.containsKey(ana)) {
320 339
				String[] unitsref = target.split(" ");
321 340
				try {
322
					//System.out.println("create relation: "+type+" ["+deb+", "+fin+"]");
341
					// System.out.println("create relation: "+type+" ["+deb+", "+fin+"]");
323 342
					Schema schema = new Schema(type);
324 343
					for (String unitid : unitsref) {
325 344
						unitid = unitid.substring(1); // remove '#'
326 345
						if (unitesRef.containsKey(unitid)) {
327 346
							Unite unite = unitesRef.get(unitid);
328 347
							schema.ajouter(unite);
329
						} else {
330
							System.out.println("Warning: missing unit id: "+unitid);
348
						}
349
						else {
350
							System.out.println("Warning: missing unit id: " + unitid);
331 351
							nSchemaError++;
332 352
						}
333 353
					}
......
341 361
						schema.putProp(prop, props.get(prop));
342 362
					}
343 363
					nSchemaAdded++;
344
				} catch(Exception e) {
345
					System.out.println("Error while creating schema with id="+id);
346 364
				}
347
			} else {
348
				System.out.println("Warning no properties found for element id="+id);
365
				catch (Exception e) {
366
					System.out.println("Error while creating schema with id=" + id);
367
				}
349 368
			}
369
			else {
370
				System.out.println("Warning no properties found for element id=" + id);
371
			}
350 372
		}
351 373
		cpb.done();
352

  
374
		
353 375
		textSubCorpus.delete();
354 376
		return Arrays.asList(nUnitsAdded, nUnitsError, nUnitsNoMatchError, nUnitsTooMuchMatchError,
355 377
				nRelationsAdded, nRelationsError,
356 378
				nSchemaAdded, nSchemaError);
357 379
	}
358

  
380
	
359 381
	private boolean parseXMLTXMFile(File xmlTXMFile) throws XMLStreamException, MalformedURLException, IOException {
360 382
		InputStream inputData = xmlTXMFile.toURI().toURL().openStream();
361 383
		factory = XMLInputFactory.newInstance();
362 384
		XMLStreamReader parser = factory.createXMLStreamReader(inputData);
363

  
385
		
364 386
		String currentType = null;
365 387
		String currentN = null;
366 388
		String currentAna = null;
367 389
		String currentName = null;
368 390
		String currentPropValue = null;
369

  
391
		
370 392
		int processMode = 0; // 0 nothing, 1 elements 2 properties
371

  
393
		
372 394
		if (!goToStandOff(parser)) {
373
			System.out.println("Error: cannot find the 'standOff' element in "+xmlTXMFile);
395
			System.out.println("Error: cannot find the 'standOff' element in " + xmlTXMFile);
374 396
			return false;
375 397
		}
376 398
		String localname = null;
377 399
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
378 400
			switch (event) {
379
			case XMLStreamConstants.START_ELEMENT:
380
				localname = parser.getLocalName();
381

  
382
				if (localname.equals("annotationGrp")) {
383
					currentType = parser.getAttributeValue(null, "type");
384
					currentN = parser.getAttributeValue(null, "subtype");
385
					if (currentType != null && currentN != null) {
386
						processMode = 1;
387
					} else {
388
						System.out.println("Warning found "+localname+" without 'type' and 'n' attribute");
401
				case XMLStreamConstants.START_ELEMENT:
402
					localname = parser.getLocalName();
403
					
404
					if (localname.equals("annotationGrp")) {
405
						currentType = parser.getAttributeValue(null, "type");
406
						currentN = parser.getAttributeValue(null, "subtype");
407
						if (currentType != null && currentN != null) {
408
							processMode = 1;
409
						}
410
						else {
411
							System.out.println("Warning found " + localname + " without 'type' and 'n' attribute");
412
							processMode = 0;
413
						}
414
					}
415
					else if (localname.equals("div")) {
416
						String type = parser.getAttributeValue(null, "type");
417
						if (type == null) type = "";
418
						if (type.endsWith("-fs"))
419
							processMode = 2;
420
					}
421
					
422
					if (processMode == 1) {
423
						if (localname.equals("span") || localname.equals("link")) {
424
							String id = parser.getAttributeValue(null, "id");
425
							String ana = parser.getAttributeValue(null, "ana");
426
							// System.out.println("SPAN id="+id +" ana="+ana+" currentType="+currentType+" currentN="+currentN);
427
							if ("Unit".equals(currentType)) {
428
								String from = parser.getAttributeValue(null, "from");
429
								String to = parser.getAttributeValue(null, "to");
430
								registerUnite(id, ana, currentN, from, to);
431
							}
432
							else if ("Relation".equals(currentType)) {
433
								String target = parser.getAttributeValue(null, "target");
434
								registerRelation(id, ana, currentN, target);
435
							}
436
							else if ("Schema".equals(currentType)) {
437
								String target = parser.getAttributeValue(null, "target");
438
								registerSchema(id, ana, currentN, target);
439
							}
440
						}
441
					}
442
					else if (processMode == 2) {
443
						if (localname.equals("fs")) {
444
							currentAna = parser.getAttributeValue(null, "id");
445
							if (elementProperties.get(currentAna) != null) System.out.println("WARNING: duplicated element properties: " + currentAna);
446
							elementProperties.put(currentAna, new HashMap<String, String>());
447
						}
448
						else if (localname.equals("f")) {
449
							currentName = parser.getAttributeValue(null, "name");
450
							currentPropValue = "";
451
						}
452
					}
453
					else {
454
						// nothing to do
455
					}
456
					
457
					break;
458
				case XMLStreamConstants.CHARACTERS:
459
					if (processMode == 2 && currentAna != null && currentName != null) {
460
						currentPropValue += parser.getText();
461
					}
462
					break;
463
				case XMLStreamConstants.END_ELEMENT:
464
					localname = parser.getLocalName();
465
					if (localname.equals("standOff")) { // stop all
466
						parser.close();
467
						inputData.close();
468
						return true;
469
					}
470
					else if (processMode == 1 && localname.equals("annotationGrp")) { // stop all
471
						currentType = null;
472
						currentN = null;
389 473
						processMode = 0;
390 474
					}
391
				} else if (localname.equals("div")) {
392
					String type = parser.getAttributeValue(null, "type");
393
					if (type == null) type = "";
394
					if (type.endsWith("-fs"))
395
						processMode = 2;
396
				} 
397

  
398
				if (processMode == 1) {
399
					if (localname.equals("span") || localname.equals("link")) {
400
						String id = parser.getAttributeValue(null, "id");
401
						String ana = parser.getAttributeValue(null, "ana");
402
						//System.out.println("SPAN id="+id +" ana="+ana+" currentType="+currentType+" currentN="+currentN);
403
						if ("Unit".equals(currentType)) {
404
							String from = parser.getAttributeValue(null, "from");
405
							String to = parser.getAttributeValue(null, "to");
406
							registerUnite(id, ana, currentN, from, to);
407
						} else if ("Relation".equals(currentType)) {
408
							String target = parser.getAttributeValue(null, "target");
409
							registerRelation(id, ana, currentN, target);
410
						} else if ("Schema".equals(currentType)) {
411
							String target = parser.getAttributeValue(null, "target");
412
							registerSchema(id, ana, currentN, target);
475
					else if (processMode == 2 && localname.equals("div")) {
476
						processMode = 0;
477
					}
478
					
479
					if (processMode == 1) {
480
						
481
					}
482
					else if (processMode == 2) {
483
						if (localname.equals("fs")) {
484
							currentAna = null;
413 485
						}
486
						else if (localname.equals("f")) {
487
							if (currentName != null && currentAna != null) {
488
								elementProperties.get(currentAna).put(currentName, currentPropValue);
489
							}
490
							currentName = null;
491
						}
414 492
					}
415
				} else if (processMode == 2) {
416
					if (localname.equals("fs")) {
417
						currentAna = parser.getAttributeValue(null, "id");
418
						if (elementProperties.get(currentAna) != null) System.out.println("WARNING: duplicated element properties: "+currentAna);
419
						elementProperties.put(currentAna, new HashMap<String, String>());
420
					} else if (localname.equals("f")) {
421
						currentName= parser.getAttributeValue(null, "name");
422
						currentPropValue = "";
493
					else {
494
						// nothing to do
423 495
					}
424
				} else {
425
					// nothing to do
426
				} 
427

  
428
				break;		
429
			case XMLStreamConstants.CHARACTERS:
430
				if (processMode == 2 && currentAna != null && currentName != null) {
431
					currentPropValue += parser.getText();
432
				}
433
				break;
434
			case XMLStreamConstants.END_ELEMENT:
435
				localname = parser.getLocalName();
436
				if (localname.equals("standOff")) { // stop all
496
					
497
					break;
498
				
499
				case XMLStreamConstants.END_DOCUMENT:
437 500
					parser.close();
438 501
					inputData.close();
439
					return true;
440
				} else if (processMode == 1 && localname.equals("annotationGrp")) { // stop all
441
					currentType = null;
442
					currentN = null;
443
					processMode = 0;
444
				} else if (processMode == 2 && localname.equals("div")) {
445
					processMode = 0;
446
				}
447

  
448
				if (processMode == 1) {
449

  
450
				} else if (processMode == 2) {
451
					if (localname.equals("fs")) {
452
						currentAna = null;
453
					} else if (localname.equals("f")) {
454
						if (currentName != null && currentAna != null) {
455
							elementProperties.get(currentAna).put(currentName, currentPropValue);
456
						}
457
						currentName = null;
458
					}
459
				} else {
460
					// nothing to do
461
				}
462

  
463
				break;
464

  
465
			case XMLStreamConstants.END_DOCUMENT:
466
				parser.close();
467
				inputData.close();
468
				return false;
502
					return false;
469 503
			}
470 504
		}
471 505
		parser.close();
472 506
		inputData.close();
473

  
507
		
474 508
		return false; // standOff not found
475 509
	}
476

  
477
	HashMap<String, HashMap<String, String>> elementProperties = new HashMap<String, HashMap<String, String>>();
478
	HashMap<String, String[]> unites = new HashMap<String, String[]>(); // id -> {ana, type, from, to}
479
	HashMap<String, Unite> unitesRef = new HashMap<String, Unite>(); // id -> Unite
510
	
511
	HashMap<String, HashMap<String, String>> elementProperties = new HashMap<>();
512
	
513
	HashMap<String, String[]> unites = new HashMap<>(); // id -> {ana, type, from, to}
514
	
515
	HashMap<String, Unite> unitesRef = new HashMap<>(); // id -> Unite
516
	
480 517
	private void registerUnite(String id, String ana, String type, String from, String to) {
481
		String[] data = {ana.substring(1), type, from.substring(5), to.substring(5)};
518
		String[] data = { ana.substring(1), type, from.substring(5), to.substring(5) };
482 519
		if (unites.containsKey(id)) {
483
			System.out.println("Warning: duplicated Unite id: "+id);
484
		} else {
520
			System.out.println("Warning: duplicated Unite id: " + id);
521
		}
522
		else {
485 523
			unites.put(id, data);
486 524
		}
487 525
	}
488
	HashMap<String, String[]> relations = new HashMap<String, String[]>(); // id -> {ana, type, target}
526
	
527
	HashMap<String, String[]> relations = new HashMap<>(); // id -> {ana, type, target}
528
	
489 529
	private void registerRelation(String id, String ana, String type, String target) {
490
		String[] data = {ana.substring(1), type, target};
530
		String[] data = { ana.substring(1), type, target };
491 531
		if (relations.containsKey(id)) {
492
			System.out.println("Warning: duplicated Relation id: "+id);
493
		} else {
532
			System.out.println("Warning: duplicated Relation id: " + id);
533
		}
534
		else {
494 535
			relations.put(id, data);
495 536
		}
496 537
	}
497
	HashMap<String, String[]> schemas = new HashMap<String, String[]>(); // id -> {ana, type, target}
538
	
539
	HashMap<String, String[]> schemas = new HashMap<>(); // id -> {ana, type, target}
540
	
498 541
	private void registerSchema(String id, String ana, String type, String target) {
499
		String[] data = {ana.substring(1), type, target};
500
		//System.out.println("R schema: "+id+" : "+Arrays.toString(data));
542
		String[] data = { ana.substring(1), type, target };
543
		// System.out.println("R schema: "+id+" : "+Arrays.toString(data));
501 544
		if (schemas.containsKey(id)) {
502
			System.out.println("Warning: duplicated Schema id: "+id);
503
		} else {
545
			System.out.println("Warning: duplicated Schema id: " + id);
546
		}
547
		else {
504 548
			schemas.put(id, data);
505 549
		}
506 550
	}
507

  
551
	
508 552
	private boolean goToStandOff(XMLStreamReader parser) throws XMLStreamException {
509 553
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
510 554
			switch (event) {
511
			case XMLStreamConstants.START_ELEMENT:
512
				String localname = parser.getLocalName();
513
				if (localname.equals("standOff")) return true;
514
				break;		
555
				case XMLStreamConstants.START_ELEMENT:
556
					String localname = parser.getLocalName();
557
					if (localname.equals("standOff")) return true;
558
					break;
515 559
			}
516 560
		}
517 561
		return false;
tmp/org.txm.core/src/java/org/txm/importer/SAttributesListener.java (revision 3015)
20 20
 *
21 21
 */
22 22
public class SAttributesListener {
23

  
24
	public HashMap<String,HashSet<String>> structs = new HashMap<String, HashSet<String>>();
25
	public HashSet<String> anatypes = new HashSet<String>();
26
	public HashMap<String, Integer> structsCountProf = new HashMap<String, Integer>();
27
	public HashMap<String, Integer> structsMaxProf = new HashMap<String, Integer>();
23
	
24
	public HashMap<String, HashSet<String>> structs = new HashMap<>();
25
	
26
	public HashSet<String> anatypes = new HashSet<>();
27
	
28
	public HashMap<String, Integer> structsCountProf = new HashMap<>();
29
	
30
	public HashMap<String, Integer> structsMaxProf = new HashMap<>();
31
	
28 32
	private String structPath = "/";
33
	
29 34
	private XMLStreamReader parser;
30

  
35
	
31 36
	SAttributesListener() {
32
		structs = new HashMap<String, HashSet<String>>();
33
		structsCountProf = new HashMap<String, Integer>();
34
		structsMaxProf = new HashMap<String, Integer>();
37
		structs = new HashMap<>();
38
		structsCountProf = new HashMap<>();
39
		structsMaxProf = new HashMap<>();
35 40
		structPath = "/";
36
		anatypes = new HashSet<String>(); // store scanned word attributes
41
		anatypes = new HashSet<>(); // store scanned word attributes
37 42
	}
38

  
43
	
39 44
	SAttributesListener(XMLStreamReader parser) {
40 45
		this();
41 46
		this.parser = parser;
42 47
	}
43

  
48
	
44 49
	public void appendResultsTo(SAttributesListener another) {
45 50
		structs = another.structs;
46 51
		structsCountProf = another.structsCountProf;
47 52
		structsMaxProf = another.structsMaxProf;
48 53
		anatypes = another.anatypes; // store scanned word attributes
49 54
	}
50

  
55
	
51 56
	public void start(XMLStreamReader parser) {
52 57
		this.parser = parser;
53 58
	}
54

  
59
	
55 60
	String W = "w";
61
	
56 62
	String ANA = "ana";
63
	
57 64
	String FORM = "form";
65
	
58 66
	/**
59 67
	 * Call this method for each START_ELEMENT stax event
68
	 * 
60 69
	 * @param localname the element localname
61 70
	 */
62 71
	public void startElement(String localname) {
63 72
		localname = localname.toLowerCase();
64

  
65
		//String localname = parser.getLocalName();
66
		if(localname.equals(W)) return;
67
		if(localname.equals(ANA)) return;
68
		if(localname.equals(FORM)) return;
69

  
70
		structPath += localname+"/";
71
		//println "add: "+structPath
73
		
74
		// String localname = parser.getLocalName();
75
		if (localname.equals(W)) return;
76
		if (localname.equals(ANA)) return;
77
		if (localname.equals(FORM)) return;
78
		
79
		structPath += localname + "/";
80
		// println "add: "+structPath
72 81
		HashSet<String> attrs = structs.get(localname);
73 82
		if (!structs.containsKey(localname)) {
74
			attrs = new HashSet<String>();
83
			attrs = new HashSet<>();
75 84
			structs.put(localname, attrs);
76 85
			structsCountProf.put(localname, 0);
77 86
			structsMaxProf.put(localname, 0);
78
		} //else {
79

  
87
		} // else {
88
		
80 89
		// get structure recursion
81
		int prof = structsCountProf.get(localname)+1;
90
		int prof = structsCountProf.get(localname) + 1;
82 91
		structsCountProf.put(localname, prof);
83 92
		if (structsMaxProf.get(localname) < prof) {
84 93
			structsMaxProf.put(localname, prof);
85 94
		}
86

  
95
		
87 96
		// get the structure attributes
88
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
97
		for (int i = 0; i < parser.getAttributeCount(); i++) {
89 98
			attrs.add(parser.getAttributeLocalName(i).toLowerCase());
90 99
		}
91 100
	}
92

  
101
	
93 102
	/**
94 103
	 * Call this method for each END_ELEMENT stax event
104
	 * 
95 105
	 * @param localname the element localname
96 106
	 */
97 107
	public void endElement(String localname) {
98 108
		localname = localname.toLowerCase();
99
		//String localname = parser.getLocalName();
100
		if(localname.equals(W)) return;
101
		if(localname.equals(ANA)) return;
102
		if(localname.equals(FORM)) return;
103

  
109
		// String localname = parser.getLocalName();
110
		if (localname.equals(W)) return;
111
		if (localname.equals(ANA)) return;
112
		if (localname.equals(FORM)) return;
113
		
104 114
		if (structPath.length() > 1) {
105 115
			int idx = structPath.lastIndexOf("/");
106 116
			if (idx > 0) {
107 117
				structPath = structPath.substring(0, idx);
108
				//println "end of $localname "+(structsCountProf.get(localname))
109
				//if (structsCountProf.get(localname) != null)
110
				structsCountProf.put(localname, structsCountProf.get(localname)-1);
118
				// println "end of $localname "+(structsCountProf.get(localname))
119
				// if (structsCountProf.get(localname) != null)
120
				structsCountProf.put(localname, structsCountProf.get(localname) - 1);
111 121
			}
112
			//println "pop: "+structPath
122
			// println "pop: "+structPath
113 123
		}
114 124
	}
115

  
116
//	boolean firstGetStructs = true;
117
	public HashMap<String,HashSet<String>> getStructs() {
118
		if (structsCountProf.containsKey("div")) {
119
			if (structsCountProf.get("div") > 0)
125
	
126
	// boolean firstGetStructs = true;
127
	public HashMap<String, HashSet<String>> getStructs() {
128
		if (structsMaxProf.containsKey("div")) {
129
			if (structsMaxProf.get("div") > 0)
120 130
				structs.remove("div1");
121
			if (structsCountProf.get("div") > 1)
131
			if (structsMaxProf.get("div") > 1)
122 132
				structs.remove("div2");
123
			if (structsCountProf.get("div") > 2)
133
			if (structsMaxProf.get("div") > 2)
124 134
				structs.remove("div3");
125
			if (structsCountProf.get("div") > 3)
135
			if (structsMaxProf.get("div") > 3)
126 136
				structs.remove("div4");
127
			if (structsCountProf.get("div") > 4)
137
			if (structsMaxProf.get("div") > 4)
128 138
				structs.remove("div5");
129
			if (structsCountProf.get("div") > 5)
139
			if (structsMaxProf.get("div") > 5)
130 140
				structs.remove("div6");
131 141
		}
132
//		if (firstGetStructs) {
133
//			firstGetStructs = false;
142
		// if (firstGetStructs) {
143
		// firstGetStructs = false;
134 144
		// fix min&maj names for CQP
135
			ArrayList<String> keys = new ArrayList<String>();
136
			keys.addAll(structs.keySet());
137
			for (String key : keys) {
138
				HashSet<String> value = structs.get(key);
139
				structs.remove(key);
140
				structs.put(key.toLowerCase(), value);
141
			}
142
//		}
143

  
145
		ArrayList<String> keys = new ArrayList<>();
146
		keys.addAll(structs.keySet());
147
		for (String key : keys) {
148
			HashSet<String> value = structs.get(key);
149
			structs.remove(key);
150
			structs.put(key.toLowerCase(), value);
151
		}
152
		// }
153
		
144 154
		return structs;
145 155
	}
146

  
156
	
147 157
	boolean firstGetstructsCountProf = true;
158
	
148 159
	public HashMap<String, Integer> getProfs() {
149

  
150
		//		if (firstGetstructsCountProf) {
151
		//			firstGetstructsCountProf = false;
152
		//			def keys = []
153
		//			keys.addAll(structsCountProf.keySet());
154
		//			for( String key : keys) {
155
		//				def value = structsCountProf.get(key);
156
		//				structsCountProf.remove(key)
157
		//				structsCountProf.put(key.toLowerCase(), value);
158
		//			}
159
		//		}
160
		HashMap<String, Integer> clone = new HashMap<String, Integer>();
160
		
161
		// if (firstGetstructsCountProf) {
162
		// firstGetstructsCountProf = false;
163
		// def keys = []
164
		// keys.addAll(structsCountProf.keySet());
165
		// for( String key : keys) {
166
		// def value = structsCountProf.get(key);
167
		// structsCountProf.remove(key)
168
		// structsCountProf.put(key.toLowerCase(), value);
169
		// }
170
		// }
171
		HashMap<String, Integer> clone = new HashMap<>();
161 172
		for (String key : structsMaxProf.keySet()) {
162 173
			if (structsMaxProf.get(key) > 0)
163
				clone.put(key, structsMaxProf.get(key)-1);
174
				clone.put(key, structsMaxProf.get(key) - 1);
164 175
			else
165 176
				clone.put(key, 0);
166 177
		}
167 178
		return clone;
168 179
	}
169

  
180
	
170 181
	public void initialize(ArrayList<String> pattributes, HashMap<String, HashSet<String>> sAttributesMap, HashMap<String, Integer> sAttributesProfs) {
171 182
		this.anatypes.addAll(pattributes);
172 183
		for (String s : sAttributesMap.keySet()) {
......
179 190
	public HashSet<String> getAnatypes() {
180 191
		return anatypes;
181 192
	}
182

  
183
//	public SAttributesListener scanFile(File xmlFile) throws MalformedURLException, IOException, XMLStreamException {
184
//		return scanFile(xmlFile, this);
185
//	}
186

  
193
	
194
	// public SAttributesListener scanFile(File xmlFile) throws MalformedURLException, IOException, XMLStreamException {
195
	// return scanFile(xmlFile, this);
196
	// }
197
	
187 198
	public void setParser(XMLStreamReader parser) {
188 199
		this.parser = parser;
189 200
	}
190

  
201
	
191 202
	/**
192 203
	 * Merge results in the parentListener
193 204
	 * 
194 205
	 * @param xmlFile
195 206
	 * @param parentListener results are appended to the parentListener if any
196 207
	 * @return
197
	 * @throws IOException 
198
	 * @throws MalformedURLException 
199
	 * @throws XMLStreamException 
208
	 * @throws IOException
209
	 * @throws MalformedURLException
210
	 * @throws XMLStreamException
200 211
	 */
201 212
	public SAttributesListener scanFile(File xmlFile) throws MalformedURLException, IOException, XMLStreamException {
202

  
213
		
203 214
		boolean startText = false;
204 215
		boolean startWord = false;
205 216
		InputStream inputData = xmlFile.toURI().toURL().openStream();
206 217
		XMLInputFactory factory = XMLInputFactory.newInstance();
207 218
		XMLStreamReader parser = factory.createXMLStreamReader(inputData);
208

  
209
//		SAttributesListener listener;
210
//		if (parentListener != null) {
211
//			listener = parentListener;
212
//			listener.setParser(parser);
213
//		} else {
214
//			listener = new SAttributesListener(parser);
215
//		}
219
		
220
		// SAttributesListener listener;
221
		// if (parentListener != null) {
222
		// listener = parentListener;
223
		// listener.setParser(parser);
224
		// } else {
225
		// listener = new SAttributesListener(parser);
226
		// }
216 227
		String TEXT = "text";
217 228
		String ANA = "ana";
218 229
		String TYPE = "type";
219
		//HashSet<String> types = new HashSet<String>();
230
		// HashSet<String> types = new HashSet<String>();
220 231
		this.setParser(parser);
221 232
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
222 233
			if (event == XMLStreamConstants.START_ELEMENT) { // start elem
223 234
				if (TEXT.equals(parser.getLocalName())) startText = true;
224

  
235
				
225 236
				if (startText) this.startElement(parser.getLocalName());
226

  
237
				
227 238
				if (this.W.equals(parser.getLocalName())) {
228 239
					startWord = true;
229
				} else if (startWord && ANA.equals(parser.getLocalName())) { // ana elem
230
					for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
240
				}
241
				else if (startWord && ANA.equals(parser.getLocalName())) { // ana elem
242
					for (int i = 0; i < parser.getAttributeCount(); i++) { // find @type
231 243
						if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
232
							this.anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
244
							this.anatypes.add(parser.getAttributeValue(i).substring(1)); // remove the #
233 245
							break;
234 246
						}
235 247
					}
236 248
				}
237
			} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
249
			}
250
			else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
238 251
				if (startText) this.endElement(parser.getLocalName());
239 252
				if (TEXT.equals(parser.getLocalName())) startText = false;
240

  
253
				
241 254
				if (this.W.equals(parser.getLocalName())) {
242 255
					startWord = false;
243 256
				}
......
245 258
		}
246 259
		if (parser != null) parser.close();
247 260
		if (inputData != null) inputData.close();
248

  
261
		
249 262
		return this;
250 263
	}
251

  
264
	
252 265
	/**
253 266
	 * scan the XML files of a directory to list the structures with their properties and levels. Also list the word properties
267
	 * 
254 268
	 * @param xmlDirectory
255 269
	 * @param wordTag
256 270
	 * @return
257
	 * @throws XMLStreamException 
258
	 * @throws IOException 
259
	 * @throws MalformedURLException 
271
	 * @throws XMLStreamException
272
	 * @throws IOException
273
	 * @throws MalformedURLException
260 274
	 */
261 275
	public static SAttributesListener scanFiles(File xmlDirectory, String wordTag) throws MalformedURLException, IOException, XMLStreamException {
262 276
		SAttributesListener listener = new SAttributesListener();
......
264 278
		for (File xmlFile : xmlDirectory.listFiles(IOUtils.HIDDENFILE_FILTER)) {
265 279
			if (xmlFile.isFile() && !xmlFile.isHidden() && xmlFile.getName().toLowerCase().endsWith(".xml")) {
266 280
				listener.scanFile(xmlFile); // results saved in 'listener' data
267
				//				println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
268
				//				println " prof: "+listener.getStructs()
269
				//				println " prof: "+listener.getProfs()
270
				//				println " path: "+listener.structPath
281
				// println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
282
				// println " prof: "+listener.getStructs()
283
				// println " prof: "+listener.getProfs()
284
				// println " path: "+listener.structPath
271 285
			}
272 286
		}
273

  
287
		
274 288
		return listener;
275 289
	}
276 290
}

Formats disponibles : Unified diff