Revision 2936

tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotations.java (revision 2936)
42 42
import org.eclipse.ui.handlers.HandlerUtil;
43 43
import org.kohsuke.args4j.Option;
44 44
import org.txm.core.messages.TXMCoreMessages;
45
import org.txm.objects.Text;
45 46
import org.txm.rcp.commands.workspace.UpdateCorpus;
46 47
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
47 48
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
......
139 140
			
140 141
			String textid = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
141 142
			Log.info("** processing text: " + textid);
143
			Text text = mainCorpus.getProject().getText(textid);
144
			if (text == null) {
145
				Log.warning("No text found with ID=" + textid);
146
				continue;
147
			}
142 148
			File xmltxmFile = mainCorpus.getProject().getText(textid).getXMLTXMFile();
143 149
			File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName());
144 150
			
......
180 186
				
181 187
				HashMap<String, String> properties = new HashMap<>();
182 188
				for (int i = 0; i < split.length; i++) {
183
					properties.put(propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix
189
					properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
184 190
				}
185 191
				
186 192
				processor.addProperty(id, properties);
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ExportCorpusAsCONNLU.java (revision 2936)
29 29

  
30 30
import java.io.File;
31 31
import java.io.IOException;
32
import java.io.RandomAccessFile;
33
import java.nio.MappedByteBuffer;
34
import java.nio.channels.FileChannel;
32
import java.io.PrintWriter;
33
import java.util.ArrayList;
34
import java.util.HashMap;
35 35

  
36
import org.apache.log4j.BasicConfigurator;
36
import org.apache.commons.lang.StringUtils;
37 37
import org.eclipse.core.commands.AbstractHandler;
38 38
import org.eclipse.core.commands.ExecutionEvent;
39 39
import org.eclipse.core.commands.ExecutionException;
40
import org.eclipse.jface.dialogs.MessageDialog;
41 40
import org.eclipse.jface.viewers.IStructuredSelection;
42
import org.eclipse.swt.SWT;
43
import org.eclipse.swt.widgets.DirectoryDialog;
44 41
import org.eclipse.ui.handlers.HandlerUtil;
45
import org.txm.searchengine.cqp.AbstractCqiClient;
42
import org.kohsuke.args4j.Option;
43
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
46 44
import org.txm.searchengine.cqp.CQPSearchEngine;
47 45
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
46
import org.txm.searchengine.cqp.clientExceptions.InvalidCqpIdException;
48 47
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
49 48
import org.txm.searchengine.cqp.corpus.CQPCorpus;
50 49
import org.txm.searchengine.cqp.corpus.MainCorpus;
50
import org.txm.searchengine.cqp.corpus.WordProperty;
51 51
import org.txm.searchengine.cqp.serverException.CqiServerError;
52
import org.txm.searchengine.ts.InternalCorpusQueryManagerLocal2;
53
import org.txm.searchengine.ts.TSCorpus;
54
import org.txm.searchengine.ts.TSCorpusManager;
55
import org.txm.utils.ConsoleProgressBar;
56
import org.txm.utils.DeleteDir;
57
import org.txm.utils.io.FileCopy;
58 52
import org.txm.utils.io.IOUtils;
59 53
import org.txm.utils.logger.Log;
60 54

  
61
import ims.tiger.corpus.Sentence;
62
import ims.tiger.corpus.T_Node;
63
import ims.tiger.index.reader.Index;
64 55
import ims.tiger.index.reader.IndexException;
65
import ims.tiger.index.writer.IndexBuilderErrorHandler;
66
import ims.tiger.index.writer.SimpleErrorHandler;
67
import ims.tiger.index.writer.XMLIndexing;
68 56
import ims.tiger.query.api.QueryIndexException;
69
import ims.tiger.query.processor.CorpusQueryProcessor;
70 57

  
71 58
/**
72
 * Import TIGERSearch annotations into a TXM corpus
59
 * Export the CoNLL-U properties and the CQP words of a corpus as a CoNLL-U corpus made of several files (one per text)
73 60
 * 
74
 * IF the corpus alreasy wontains TIGER annotations, they are replaced
75
 * 
76
 * The annotations are given using a TIGERSEarch binary corpus OR a TIGER source directory using a "main.xml" file
77
 * 
78 61
 * @author mdecorde.
79 62
 */
80 63
public class ExportCorpusAsCONNLU extends AbstractHandler {
81 64
	
82
	public static final String ID = "org.txm.rcp.commands.function.ComputeTSIndex"; //$NON-NLS-1$
65
	public static final String ID = ExportCorpusAsCONNLU.class.getName();
83 66
	
67
	@Option(name = "connluResultDirectory", usage = "connluResultDirectory", widget = "Folder", required = true, def = "connlu-result-directory")
68
	File connluResultDirectory;
69
	
70
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud")
71
	String propertiesPrefix;
72
	
73
	// @Option(name = "useUDForms", usage = "use the ud form property instead of CQP 'word' property", widget = "Boolean", required = true, def = "true")
74
	Boolean useUDForms = true;
75
	
76
	// @Option(name = "transfertAllWords", usage = "Transfert word not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
77
	Boolean transfertAllWords = true;
78
	
79
	// @Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
80
	Boolean detectGap = false;
81
	
82
	@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
83
	String formCorrPropertyName;
84
	
85
	@Option(name = "lemmaCorrPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
86
	String lemmaCorrPropertyName;
87
	
88
	@Option(name = "uposCorrPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
89
	String uposCorrPropertyName;
90
	
91
	@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
92
	String xposCorrPropertyName;
93
	
94
	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
95
	
84 96
	/*
85 97
	 * (non-Javadoc)
86 98
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
......
91 103
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
92 104
		
93 105
		Object s = selection.getFirstElement();
94
		if (s instanceof MainCorpus) {
95
			CQPCorpus corpus = (CQPCorpus) s;
96
			MainCorpus mainCorpus = corpus.getMainCorpus();
97
			
98
			File tigerCorpusDirectory = null;
99
			DirectoryDialog dialog = new DirectoryDialog(HandlerUtil.getActiveShell(event), SWT.OPEN);
100
			String path = dialog.open();
101
			if (path == null) {
102
				Log.warning("Aborting annotation importation.");
103
				return null;
104
			}
105
			else {
106
				tigerCorpusDirectory = new File(path);
107
			}
108
			
109
			File tigerDirectory = new File(mainCorpus.getProjectDirectory(), "tiger");
110
			File tigerCorpusExistingDirectory = new File(tigerDirectory, tigerCorpusDirectory.getName());
111
			if (tigerCorpusExistingDirectory.exists()) {
112
				boolean doIt = MessageDialog.openConfirm(HandlerUtil.getActiveShell(event), "Replace existing annotations", "TIGERSearch annotations already exists, replace them ?");
113
				if (!doIt) {
114
					Log.warning("Aborting annotation importation.");
115
					return null;
116
				}
117
			}
118
			
119
			if (new File(tigerCorpusDirectory, "word.lexicon").exists() && new File(tigerCorpusDirectory, "corpus_config.xml").exists()) {
120
				// ok this is a TIGERSearch binary corpus
121
			}
122
			else {
123
				
124
				// need to build a TIGERSearch binary corpus
125
				File tigerBinaryCorpusDirectory = new File(tigerCorpusDirectory, "tiger");
126
				if (!buildTIGERCorpus(mainCorpus, tigerCorpusDirectory, tigerBinaryCorpusDirectory)) {
127
					Log.warning("Aborting annotation importation.");
128
					return null;
129
				}
130
				tigerCorpusDirectory = new File(tigerBinaryCorpusDirectory, corpus.getName());
131
			}
132
			
133
			try {
134
				return importAnnotations(mainCorpus, tigerCorpusDirectory, "editionId");
135
			}
136
			catch (Exception e) {
137
				e.printStackTrace();
138
				return null;
139
			}
140
		}
141
		else {
106
		if (!(s instanceof MainCorpus)) {
142 107
			Log.warning("Selection is not a corpus. Aborting.");
143 108
			return null;
144 109
		}
145
	}
146
	
147
	private boolean buildTIGERCorpus(MainCorpus corpus, File sourceDirectory, File tigerDir) {
148
		tigerDir.mkdirs();
149 110
		
150
		File configfile = new File(tigerDir, "tigersearch.logprop");
151
		if (!configfile.exists()) {
152
			TSCorpus.createLogPropFile(tigerDir);
111
		ParametersDialog.open(this);
112
		connluResultDirectory.mkdirs();
113
		if (connluResultDirectory == null || !connluResultDirectory.exists() || !connluResultDirectory.isDirectory()) {
114
			Log.warning("Error: connlu result directory does not exists: " + connluResultDirectory);
115
			return null;
153 116
		}
154 117
		
155
		BasicConfigurator.configure();
156
		File master = new File(sourceDirectory, "main.xml");
157
		if (!master.exists()) master = new File(sourceDirectory, "master.xml");
118
		CQPCorpus corpus = (CQPCorpus) s;
119
		MainCorpus mainCorpus = corpus.getMainCorpus();
158 120
		
159
		if (!master.exists()) {
160
			Log.warning("Error: Can't create TIGERSearch corpus: no main or master file found in " + sourceDirectory);
161
			return false;
162
		}
163
		String uri = master.getAbsolutePath(); // TIGER corpus source root file
164
		File tigerBinDir = new File(tigerDir, corpus.getName());
165
		tigerBinDir.mkdirs();
166 121
		try {
167
			IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()) {
168
				
169
				@Override
170
				public void setMessage(String message) {}
171
				
172
				@Override
173
				public void setNumberOfSentences(int number) {}
174
				
175
				@Override
176
				public void setProgressBar(int value) {}
177
			};
178
			
179
			XMLIndexing indexing = new XMLIndexing(corpus.getName(), uri, tigerBinDir.getAbsolutePath(), handler, false);
180
			
181
			indexing.startIndexing();
182
			
183
			File logs = new File(tigerBinDir, "indexing.log");
184
			
185
			String txt = IOUtils.getText(logs);
186
			if (txt.contains("Error in corpus graph ")) {
187
				Log.warning("Error while importing TIGER corpus: " + txt);
188
				return false;
189
			}
122
			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, detectGap);
190 123
		}
191 124
		catch (Exception e) {
192
			System.out.println(e.getMessage());
193
			return false;
125
			Log.warning(e);
126
			e.printStackTrace();
194 127
		}
195
		return true;
128
		
129
		return null;
196 130
	}
197 131
	
198 132
	/**
......
208 142
	 * @throws CqiServerError
209 143
	 * @throws IOException
210 144
	 * @throws UnexpectedAnswerException
145
	 * @throws InvalidCqpIdException
211 146
	 */
212
	public static int importAnnotations(MainCorpus corpus, File tigerCorpusDirectory, String wordIdAttribute) throws IndexException, QueryIndexException, UnexpectedAnswerException, IOException,
147
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String formCorrPropertyName, String lemmaCorrPropertyName,
148
			String uposCorrPropertyName, String xposCorrPropertyName, boolean detectGap) throws UnexpectedAnswerException,
149
			IOException,
213 150
			CqiServerError,
214
			CqiClientException {
151
			CqiClientException, InvalidCqpIdException {
215 152
		
216
		// TXM corpus files
217
		File tigerDirectory = new File(corpus.getProjectDirectory(), "tiger");
218
		File tigerCorpusExistingDirectory = new File(tigerDirectory, corpus.getName());
219
		DeleteDir.deleteDirectory(tigerCorpusExistingDirectory);
220
		tigerCorpusExistingDirectory.mkdirs();
221
		
222
		File configfile = new File(tigerDirectory, "tigersearch.logprop");
223
		if (!configfile.exists()) {
224
			TSCorpus.createLogPropFile(tigerDirectory);
153
		if (!conlluResultDirectory.exists()) {
154
			conlluResultDirectory.mkdirs();
225 155
		}
156
		int numberOfWordsWritten = 0;
157
		int numberOfSentencesWritten = 0;
158
		int numberOfTextsWritten = 0;
226 159
		
227
		AbstractCqiClient CQI = CQPSearchEngine.getCqiClient();
160
		String[] textIds = mainCorpus.getCorpusTextIdsList();
161
		int[] start_limits = mainCorpus.getTextStartLimits();
162
		int[] end_limits = mainCorpus.getTextEndLimits();
228 163
		
229
		TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile);
230
		
231
		TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName());
232
		InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager;
233
		CorpusQueryProcessor processor = tigermanager.getQueryProcessor();
234
		
235
		Index index = processor.getIndex();
236
		int size = 0;
237
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
238
			size += index.getNumberOfTNodes(nr);
239
		}
240
		
241
		if (size == 0) {
242
			Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting.");
243
			return 0;
244
		}
245
		
246
		Log.info("Importing " + size + " word annotations...");
247
		
248
		// compute start position of sentences
249
		int[] starts = new int[index.getNumberOfGraphs()];
250
		for (int i = 0; i < index.getNumberOfGraphs(); i++) {
251
			starts[i] = 0;
252
			if (i > 0) {
253
				starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1];
164
		for (String p : propNames) {
165
			WordProperty wp = mainCorpus.getProperty(prefix + p);
166
			if (wp == null) {
167
				Log.warning("Error: cannot find the Conllu property: " + prefix + p);
168
				return 0;
254 169
			}
255 170
		}
256 171
		
257
		File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data");
258
		RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw");
259
		FileChannel offsetsFileChannel = offsetsRAFile.getChannel();
260
		MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES);
261
		// out.putInt(positions[i])
262
		
263
		File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data");
264
		RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw");
265
		FileChannel presencesFileChannel = presencesRAFile.getChannel();
266
		MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size);
267
		
268
		int numberOfWordsAnnotated = 0;
269
		
270
		// for each sentence
271
		ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs());
272
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
273
			cpb.tick();
274
			int sent_size = index.getNumberOfTNodes(nr);
275
			Sentence sent = tcorpus.manager.getSentence(nr);
172
		for (int iText = 0; iText < start_limits.length; iText++) {
276 173
			
277
			String[] ids = new String[sent_size];
278
			int[] tigerPositions = new int[sent_size];
279
			for (int t = 0; t < sent_size; t++) {
280
				T_Node terminal = (T_Node) sent.getTerminalAt(t);
281
				ids[t] = terminal.getFeature(wordIdAttribute);
282
				
283
				// try fixing ID
284
				if (ids[t].startsWith("w")) {
285
					if (!ids[t].startsWith("w_")) {
286
						ids[t] = "w_" + ids[t].substring(1);
174
			// Build corpus positions
175
			int[] positions = new int[end_limits[iText] - start_limits[iText] + 1];
176
			int tmp = 0;
177
			for (int n = start_limits[iText]; n <= end_limits[iText]; n++) {
178
				positions[tmp++] = n;
179
			}
180
			numberOfWordsWritten += positions.length;
181
			
182
			// Get UD properties
183
			WordProperty wp;
184
			wp = mainCorpus.getProperty(prefix + "id");
185
			String[] idValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
186
			
187
			WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form");
188
			String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions);
189
			
190
			wp = mainCorpus.getProperty(prefix + "lemma");
191
			String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
192
			
193
			wp = mainCorpus.getProperty(prefix + "upos");
194
			String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
195
			
196
			wp = mainCorpus.getProperty(prefix + "xpos");
197
			String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
198
			
199
			wp = mainCorpus.getProperty(prefix + "feats");
200
			String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
201
			
202
			wp = mainCorpus.getProperty(prefix + "head");
203
			String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
204
			
205
			wp = mainCorpus.getProperty(prefix + "deprel");
206
			String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
207
			
208
			wp = mainCorpus.getProperty(prefix + "deps");
209
			String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
210
			
211
			wp = mainCorpus.getProperty(prefix + "misc");
212
			String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
213
			// print UD sentences
214
			
215
			
216
			// build sentence, first pass using UD word sentence positions
217
			ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
218
			ArrayList<Integer> tmpSentence = new ArrayList<>();
219
			for (int p = 0; p < positions.length; p++) {
220
				// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats="
221
				// + featsValues[p] + " head="
222
				// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
223
				if (idValues[p].equals("1")) {
224
					
225
					if (tmpSentence.size() > 0) {
226
						sentences.add(new ArrayList<>(tmpSentence));
287 227
					}
228
					
229
					// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
230
					// feats="
231
					// + featsValues[p] + " head="
232
					// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
233
					tmpSentence.clear();
288 234
				}
289
				else {
290
					ids[t] = "w_" + ids[t];
291
				}
292
				tigerPositions[t] = starts[nr] + t;
293
				// System.out.println("T id="+terminal.getID());
235
				
236
				tmpSentence.add(p);
294 237
			}
238
			positions = null; // free memory
295 239
			
296
			int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids);
297
			Integer[] cqpPositions = new Integer[sent_size];
298
			Integer[] offsets = new Integer[sent_size];
299
			for (int t = 0; t < sent_size; t++) {
300
				if (ids_idx[t] >= 0) {
301
					int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]);
302
					if (positions.length > 1) {
303
						Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]);
304
					}
305
					cqpPositions[t] = positions[0]; // take the first position
240
			// fixing sentences
241
			for (int s = 0; s < sentences.size(); s++) {
242
				// int c = 0;
243
				// ArrayList<Integer> sentence = sentences.get(s);
244
				// for (int ip = 0 ; ip < sentence.size() ; ip++) {
245
				//
246
				// int p = sentence.get(ip);
247
				//
248
				// if (idValues[p].equals("__UNDEF__")) {
249
				// c++;
250
				// }
251
				// }
252
				// if (c == 0) { // al is fine
253
				//
254
				// } else if (c )
255
			}
256
			
257
			if (tmpSentence.size() > 0) { // add last sentence
258
				sentences.add(new ArrayList<>(tmpSentence));
259
			}
260
			
261
			// fixing sentence __NULL__ ud properties
262
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
263
				ArrayList<Integer> sentence = sentences.get(iSentence);
264
				
265
				int[] sentencePositions = new int[sentence.size()];
266
				for (int p = 0; p < sentence.size(); p++)
267
					sentencePositions[p] = sentence.get(p);
268
				
269
				// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
270
				String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
271
				
272
				String[] words = null;
273
				if (formCorrPropertyName != null && formCorrPropertyName.length() > 0) {
274
					words = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(formCorrPropertyName).getQualifiedName(), sentencePositions);
306 275
				}
307
				else { // word not in the CQP corpus
308
					Log.warning("Could not find word for id=" + ids[t]);
309
					cqpPositions[t] = null;
276
				String[] lemmas = null;
277
				if (lemmaCorrPropertyName != null && lemmaCorrPropertyName.length() > 0) {
278
					lemmas = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(lemmaCorrPropertyName).getQualifiedName(), sentencePositions);
310 279
				}
280
				String[] upos = null;
281
				if (uposCorrPropertyName != null && uposCorrPropertyName.length() > 0) {
282
					upos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(uposCorrPropertyName).getQualifiedName(), sentencePositions);
283
				}
284
				String[] xpos = null;
285
				if (xposCorrPropertyName != null && xposCorrPropertyName.length() > 0) {
286
					xpos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(xposCorrPropertyName).getQualifiedName(), sentencePositions);
287
				}
311 288
				
312
				if (cqpPositions[t] != null) {
313
					offsets[t] = cqpPositions[t] - tigerPositions[t];
289
				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
290
				// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
291
				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
292
				// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
293
				
294
				HashMap<String, String> sentIds = new HashMap<>();
295
				for (int ip = 0; ip < sentence.size(); ip++) {
296
					int p = sentence.get(ip);
297
					
298
					if (!idValues[p].equals("__UNDEF__")) { // store "old id -> new id"
299
						sentIds.put(idValues[p], "" + (ip + 1)); // from 1 to N
300
					}
301
					
302
					// new word
303
					if (miscValues[p].equals("__UNDEF__")) {
304
						miscValues[p] = "XmlId=" + ids[ip];
305
					}
306
					
307
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
308
					if (words != null && formValues[p].equals("__UNDEF__")) {
309
						formValues[p] = words[ip];
310
					}
311
					if (lemmas != null && lemmaValues[p].equals("__UNDEF__")) {
312
						lemmaValues[p] = lemmas[ip];
313
					}
314
					if (upos != null && uposValues[p].equals("__UNDEF__")) {
315
						uposValues[p] = upos[ip];
316
					}
317
					if (xpos != null && xposValues[p].equals("__UNDEF__")) {
318
						xposValues[p] = xpos[ip];
319
					}
314 320
				}
315
				else {
316
					offsets[t] = null;
321
				
322
				// fixing head and set missing head to 0 and root
323
				for (int ip = 0; ip < sentence.size(); ip++) {
324
					int p = sentence.get(ip);
325
					
326
					
327
					
328
					// fixing id value
329
					idValues[p] = "" + (ip + 1);  // from 1 to N
330
					
331
					// fixing head values
332
					if (sentIds.containsKey(headValues[p])) {
333
						headValues[p] = sentIds.get(headValues[p]);
334
					}
335
					else { // new word, set to default values
336
						headValues[p] = "0";
337
						deprelValues[p] = "root";
338
						depsValues[p] = "_";
339
					}
317 340
				}
318 341
			}
319
			// System.out.println("ids="+Arrays.toString(ids));
320
			// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
321
			// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
322
			// System.out.println("cqp positions="+Arrays.toString(cqpPositions));
323
			// System.out.println("offsets="+Arrays.toString(offsets));
324 342
			
325
			// writing data to offset and presences files
326
			for (int t = 0; t < sent_size; t++) {
343
			// writing sentences
344
			File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu");
345
			PrintWriter writer = IOUtils.getWriter(resultConlluFile);
346
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
347
				ArrayList<Integer> sentence = sentences.get(iSentence);
327 348
				
328
				if (offsets[t] != null) {
329
					numberOfWordsAnnotated++;
330
					presencesMapped.put((byte) 1);
331
					offsetsMapped.putInt(offsets[t]);
349
				int[] sentencePositions = new int[sentence.size()];
350
				for (int p = 0; p < sentence.size(); p++)
351
					sentencePositions[p] = sentence.get(p);
352
				
353
				String[] gap = null;
354
				if (detectGap && mainCorpus.getProperty("gap") != null) {
355
					gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
332 356
				}
333
				else {
334
					presencesMapped.put((byte) 0);
335
					offsetsMapped.putInt(0);
357
				
358
				String[] tokens = new String[sentences.size()];
359
				for (int ip = 0; ip < sentence.size(); ip++) {
360
					tokens[ip] = formValues[sentence.get(ip)];
336 361
				}
362
				
363
				writer.println("# text = " + StringUtils.join(tokens, " "));
364
				writer.println("# newdoc id = " + textIds[iText]);
365
				writer.println("# sent_id = " + (iSentence + 1));
366
				
367
				for (int p : sentence) {
368
					
369
					if (gap != null && gap[p] != null) writer.println("# gap");
370
					
371
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
372
					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
373
							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
374
							+ "\t" + depsValues[p] + "\t" + miscValues[p]);
375
				}
376
				writer.println("");
377
				numberOfSentencesWritten++;
337 378
			}
379
			writer.close();
380
			
381
			numberOfTextsWritten++;
338 382
		}
339
		cpb.done();
340 383
		
341
		offsetsFileChannel.close();
342
		offsetsRAFile.close();
343
		presencesFileChannel.close();
344
		presencesRAFile.close();
384
		System.out.println("N words written: " + numberOfWordsWritten);
385
		System.out.println("N sentences written: " + numberOfSentencesWritten);
386
		System.out.println("N texts written: " + numberOfTextsWritten);
345 387
		
346
		Log.info("Finalizing TIGERSearch corpus");
347
		if (numberOfWordsAnnotated > 0) {
348
			FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory);
349
			Log.info("Done. " + numberOfWordsAnnotated + " words annotated.");
350
		}
351
		else {
352
			Log.warning("Warning: no words could be aligned with the CQP corpus. Aborting");
353
		}
354
		
355
		return numberOfWordsAnnotated;
388
		return numberOfWordsWritten;
356 389
	}
357 390
}
tmp/org.txm.core/src/java/org/txm/xml/xmltxm/XMLTXMWordPropertiesInjection.java (revision 2936)
64 64
		super(infile);
65 65
	}
66 66
	
67
	/**
68
	 * @param id : the word identifier (id) to process.
69
	 * @param properties hashmap of ana@type -> ana@value entries to inject. Warning: each ana@type key must be prefixed with "#"
70
	 * 
71
	 */
67 72
	public void addProperty(String id, HashMap<String, String> properties) throws IOException, XMLStreamException {
68 73
		if (rules != null) {
69 74
			rules.put(id, properties);
70 75
		}
71 76
	}
72 77
	
78
	/**
79
	 * @param rules the keys are the word identifiers (id) to process. The values are hashmaps of ana@type -> ana@value entries to inject. Warning: each ana@type key must be prefixed with "#"
80
	 * 
81
	 */
73 82
	public void setProperties(HashMap<String, HashMap<String, String>> rules) throws IOException, XMLStreamException {
74 83
		
75 84
		this.rules = rules;
......
90 99
			
91 100
			ArrayList<String[]> formValues = new ArrayList<>();
92 101
			
93
			StringBuilder resp = new StringBuilder(), type = new StringBuilder(), value = new StringBuilder();
102
			StringBuilder value = new StringBuilder();
94 103
			
104
			String resp = "";
105
			
106
			String type = "";
107
			
95 108
			@Override
96 109
			public boolean deactivate() {
97 110
				return true;
......
118 131
					// store values
119 132
					inAna = true;
120 133
					
121
					resp.setLength(0);
122
					resp.append(parser.getAttributeValue(null, "resp"));
134
					resp = parser.getAttributeValue(null, "resp");
123 135
					
124
					type.setLength(0);
125
					type.append(parser.getAttributeValue(null, "type"));
136
					type = parser.getAttributeValue(null, "type");
126 137
					
127 138
					value.setLength(0);
128 139
					return; // write ana later
......
131 142
					// store values
132 143
					inForm = true;
133 144
					
134
					type.setLength(0);
145
					
135 146
					if (parser.getAttributeValue(null, "type") != null) {
136
						type.append(parser.getAttributeValue(null, "type"));
147
						type = parser.getAttributeValue(null, "type");
137 148
					}
149
					else {
150
						type = "";
151
					}
138 152
					
139 153
					value.setLength(0);
140 154
					return; // write form later
......
165 179
					// update ana values
166 180
					HashMap<String, String> values = rules.get(id);
167 181
					
168
					for (String[] l : anaValues) { // update existing values
182
					for (String[] l : anaValues) { // update existing txm:ana and remove the updated value from 'values'
169 183
						if (values.containsKey(l[0])) {
170 184
							l[2] = values.get(l[0]);
171 185
							values.remove(l[0]);
172 186
						}
173 187
					}
174 188
					
175
					for (Entry<String, String> e : values.entrySet()) { // create new values
176
						anaValues.add(new String[] { "#" + e.getKey(), "#txm", e.getValue() });
189
					for (Entry<String, String> e : values.entrySet()) { // create new values (for remaining values of 'values'
190
						anaValues.add(new String[] { e.getKey(), "#txm", e.getValue() }); // the ana type is already prefixed with #
177 191
					}
178 192
					
179 193
					// write forms

Also available in: Unified diff