Révision 4025

TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TSCorpus.java (revision 4025)
219 219
	 * @param tigerPosition
220 220
	 * @return 1 if the TIGER position has a position in CQP ; 0 if not
221 221
	 */
222
	public int getPresence(int tigerPosition) {
222
	public byte getPresence(int tigerPosition) {
223 223
		if (presencesMapped != null) {
224
			return presencesMapped.getInt(tigerPosition * Integer.BYTES);
224
			return presencesMapped.get(tigerPosition);
225 225
		}
226 226
		else {
227 227
			return 0;
......
233 233
	 * @param tigerPositions
234 234
	 * @return 1 if the TIGER position has a position in CQP ; 0 if not
235 235
	 */
236
	public int[] getPresences(int tigerPositions[]) {
237
		int[] ret = new int[tigerPositions.length];
236
	public byte[] getPresences(int tigerPositions[]) {
237
		byte[] ret = new byte[tigerPositions.length];
238 238
		if (presencesMapped != null) {
239 239
			for (int i = 0; i < tigerPositions.length; i++) {
240
				ret[i] = presencesMapped.getInt(tigerPositions[i] * Integer.BYTES);
240
				ret[i] = presencesMapped.get(tigerPositions[i] * Integer.BYTES);
241 241
			}
242 242
		}
243 243
		
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 4025)
42 42
import org.txm.utils.io.IOUtils;
43 43
import org.txm.utils.logger.Log;
44 44

  
45
import cern.colt.Arrays;
45 46
import ims.tiger.corpus.Feature;
46 47
import ims.tiger.corpus.Header;
47 48
import ims.tiger.corpus.Sentence;
......
258 259
		int iPivot = variables.indexOf("pivot"); //$NON-NLS-1$
259 260

  
260 261
		MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped();
262
		MappedByteBuffer presenceMapped = tcorpus.getPresencesMapped();
261 263
		// MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped();
264
		
265
		ArrayList<String> warnings = new ArrayList<String>();
262 266

  
263 267
		boolean useSubMatches = TigerSearchTreePreferences.getInstance().getBoolean(TigerSearchTreePreferences.USESUBMATCHES);
264 268

  
......
282 286
					if (iPivot != -1 && i != iPivot) continue; // skip match that are not 'pivot'
283 287

  
284 288
					int left = sent_start + index.getLeftCorner(sent, match[i]);
285
					if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus
286
						left += offsetsMapped.getInt(left * Integer.BYTES);
287
						// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES));
288
					}
289 289
					int right = sent_start + index.getRightCorner(sent, match[i]);
290
					if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus
291
						right += offsetsMapped.getInt(right * Integer.BYTES);
292
					}
293
					// System.out.println(" M="+match[i]+" ("+left+", "+right+")");
290
					
291
					// test if the match position is also in the CQP positions
292
					if (presenceMapped.get(left) > 0  && presenceMapped.get(right) > 0) {
294 293

  
295
					TIGERMatch tigerMatch = new TIGERMatch(left, right);
294
						if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus
295
							if (presenceMapped.get(left) > 0) {
296
								left += offsetsMapped.getInt(left * Integer.BYTES);
297
							}
298
							// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES));
299
						}
296 300

  
297
					// System.out.println(" ajusted="+(tigerMatch));
298
					tigerMatchesList.add(tigerMatch);
301
						if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus
302
							if (presenceMapped.get(right) > 0) {
303
								right += offsetsMapped.getInt(right * Integer.BYTES);
304
							}
305
						}
306
						// System.out.println(" M="+match[i]+" ("+left+", "+right+")");
299 307

  
300
					if (!useSubMatches) { // use only the first submatch
301
						break;
308
						TIGERMatch tigerMatch = new TIGERMatch(left, right);
309

  
310
						// System.out.println(" ajusted="+(tigerMatch));
311
						tigerMatchesList.add(tigerMatch);
312

  
313
						if (!useSubMatches) { // use only the first submatch
314
							break;
315
						}
316
					} else {
317
						warnings.add("<"+left+", "+right+">");
302 318
					}
303 319
				}
304 320
			}
305 321
		}
322
		
323
		if (warnings.size() > 0) {
324
			Log.warning("Some TIGER matches are not in the CQP corpus: "+StringUtils.join(warnings, ", "));
325
		}
306 326

  
307 327
		// intersect with corpus matches
308 328
		List<? extends Match> result2 = Match.intersect(corpus.getMatches(), new ArrayList<>(tigerMatchesList), true);
......
526 546
			int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); //$NON-NLS-1$
527 547
			Integer[] cqpPositions = new Integer[sent_size];
528 548
			Integer[] offsets = new Integer[sent_size];
549
			boolean error = false;
529 550
			for (int t = 0; t < sent_size; t++) {
530 551
				if (ids_idx[t] >= 0) {
531 552
					int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); //$NON-NLS-1$
......
536 557
				}
537 558
				else { // word not in the CQP corpus
538 559
					Log.warning("Could not find word for id=" + ids[t]);
560

  
539 561
					cqpPositions[t] = null;
562
					error = true;
540 563
				}
541 564

  
542 565
				if (cqpPositions[t] != null) {
......
546 569
					offsets[t] = null;
547 570
				}
548 571
			}
572
			if (error) {
573
				Log.warning("	IDS      =" + " "+ids.length+" "+Arrays.toString(ids));
574
				Log.warning("	IDS_IDX  =" + " "+ids_idx.length+" "+Arrays.toString(ids_idx));
575
				Log.warning("	CQP      =" + " "+cqpPositions.length+" "+Arrays.toString(cqpPositions));
576
				Log.warning("	TIGER    =" + " "+tigerPositions.length+" "+Arrays.toString(tigerPositions));
577
				Log.warning("	OFFSET   =" + " "+offsets.length+" "+Arrays.toString(offsets));
578
			}
549 579
			// System.out.println("ids="+Arrays.toString(ids));
550 580
			// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
551 581
			// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4025)
64 64
		files.sort()
65 65

  
66 66
		println "Add XmlId if necessary & remove empty nodes"
67

  
68
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
69

  
67 70
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
68 71
		for (File conlluFile : files) {
69 72
			cpb_texts.tick()
......
85 88
						continue; // next !
86 89
					}
87 90
				}
91
				
88 92
				def temp_multiwords = [:]
89

  
90 93
				for (int i = 0 ; i < lines.size() ; i++) {
91 94
					String line = lines[i]
92 95
					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
93 96

  
94 97
					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
95 98

  
96
					if (split[-1] != null && !split[-1].contains("XmlId=")) {
99
					if (temp_multiwords.containsKey(split[0])) { // this word XMLid must be the same as its multiword id, see below
100
						String id = temp_multiwords.remove(split[0]);
97 101
						if (split[-1] == "_") {
98
							split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
102
							split[-1] = "XmlId="+id
99 103
						} else {
100
							split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
104
							split[-1] += "|XmlId="+id
101 105
						}
106
					} else {
107

  
108
						if (split[-1] != null && !split[-1].contains("XmlId=")) { // There is no XmlID -> create one and manage subwords
109
							String id = "w_"+textid+"_"+(wcounter++);
110
							if (split[-1] == "_") {
111
								split[-1] = "XmlId="+id
112
							} else {
113
								split[-1] += "|XmlId="+id
114
							}
115
							
116
							if (split[0].contains("-") && contractionsManagement == "surface") {
117
								temp_multiwords = [:] // reset to avoid using another multiwords 
118
								String[] fromstart= split[0].split("-", 2)
119
								int pfrom = Integer.parseInt(fromstart[0])
120
								int pend = Integer.parseInt(fromstart[1])
121
								for (int p = pfrom ; p <= pend ; p++) {
122
									temp_multiwords.put(""+p, id)
123
								}
124
								println temp_multiwords
125
							}
126
						}
102 127
					}
103 128

  
104 129
					lines[i] = split.join("\t") // rebuild the line
......
111 136

  
112 137
		// Keep or not contractions
113 138
		File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm")
114
		
115
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
116 139

  
117 140
		if (contractionsManagement == UDPreferences.ALL) {
118 141
			conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done
119 142
		} else {
120
			
143

  
121 144
			conlluSrcForTXMDirectory.deleteDir()
122 145
			conlluSrcForTXMDirectory.mkdirs()
123
			
124
			println "Contractions managment mode is '$contractionsManagement'"
146

  
147
			println "Contractions management mode is '$contractionsManagement'"
125 148
			cpb_texts = new ConsoleProgressBar(files.size())
126 149
			for (File conlluFile : files) {
127 150
				cpb_texts.tick()
......
142 165
						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
143 166

  
144 167
						def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
145

  
168
						
146 169
						if (contractionsManagement == UDPreferences.SYNTAX) {
147 170
							if (split[0].contains("-")) {
148 171

  
149 172
								// stores the syntatic word id and the ortographic word properties
150
								temp_multiwords = [:]
173
								temp_multiwords = [:] // reset to avoid using another multiwords 
151 174
								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
152 175
								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
153 176
								for (int ii = n1 ; ii <= n2 ; ii++) {
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4025)
30 30
					int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
31 31
					int n =  n2 - n1;
32 32
					
33
					//System.out.println("Word "+Arrays.toString(split));
34
					//System.out.println("lines to insert: "+n);
35 33
					if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
36 34
						
37 35
						ArrayList<String[]> newlines = new ArrayList<>();
......
73 71
					} else {
74 72
						//System.out.println("NOT FIXING "+conll.get(i));
75 73
					}
76
					
77
					
78 74
				}
79 75
			}
80 76
			
TXM/trunk/bundles/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 4025)
5 5
Bundle-SymbolicName: org.txm.treetagger.core;singleton:=true
6 6
Bundle-Version: 1.0.0.qualifier
7 7
Bundle-Name: TreeTagger Core
8
Require-Bundle: org.txm.nlp.core;bundle-version="1.0.0",
9
 org.txm.core
8
Require-Bundle: org.txm.core,
9
 org.txm.nlp.core;bundle-version="1.0.0"
10 10
Bundle-ActivationPolicy: lazy
11 11
Bundle-ManifestVersion: 2
12 12
Bundle-RequiredExecutionEnvironment: JavaSE-16
TXM/trunk/bundles/org.txm.udpipe.core/plugin.xml (revision 4025)
2 2
<?eclipse version="3.4"?>
3 3
<plugin>
4 4
   <extension
5
         point="org.txm.annotation.core.AnnotationEngine">
5
         point="org.txm.nlp.core.NLPEngine">
6 6
      <AnnotationEngine
7 7
            class="org.txm.udpipe.core.UDPipeEngine"
8 8
            description="UDPipe wrapper">

Formats disponibles : Unified diff