29 |
29 |
|
30 |
30 |
import java.io.File;
|
31 |
31 |
import java.io.IOException;
|
32 |
|
import java.io.RandomAccessFile;
|
33 |
|
import java.nio.MappedByteBuffer;
|
34 |
|
import java.nio.channels.FileChannel;
|
|
32 |
import java.io.PrintWriter;
|
|
33 |
import java.util.ArrayList;
|
|
34 |
import java.util.HashMap;
|
35 |
35 |
|
36 |
|
import org.apache.log4j.BasicConfigurator;
|
|
36 |
import org.apache.commons.lang.StringUtils;
|
37 |
37 |
import org.eclipse.core.commands.AbstractHandler;
|
38 |
38 |
import org.eclipse.core.commands.ExecutionEvent;
|
39 |
39 |
import org.eclipse.core.commands.ExecutionException;
|
40 |
|
import org.eclipse.jface.dialogs.MessageDialog;
|
41 |
40 |
import org.eclipse.jface.viewers.IStructuredSelection;
|
42 |
|
import org.eclipse.swt.SWT;
|
43 |
|
import org.eclipse.swt.widgets.DirectoryDialog;
|
44 |
41 |
import org.eclipse.ui.handlers.HandlerUtil;
|
45 |
|
import org.txm.searchengine.cqp.AbstractCqiClient;
|
|
42 |
import org.kohsuke.args4j.Option;
|
|
43 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
|
46 |
44 |
import org.txm.searchengine.cqp.CQPSearchEngine;
|
47 |
45 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
|
|
46 |
import org.txm.searchengine.cqp.clientExceptions.InvalidCqpIdException;
|
48 |
47 |
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
|
49 |
48 |
import org.txm.searchengine.cqp.corpus.CQPCorpus;
|
50 |
49 |
import org.txm.searchengine.cqp.corpus.MainCorpus;
|
|
50 |
import org.txm.searchengine.cqp.corpus.WordProperty;
|
51 |
51 |
import org.txm.searchengine.cqp.serverException.CqiServerError;
|
52 |
|
import org.txm.searchengine.ts.InternalCorpusQueryManagerLocal2;
|
53 |
|
import org.txm.searchengine.ts.TSCorpus;
|
54 |
|
import org.txm.searchengine.ts.TSCorpusManager;
|
55 |
|
import org.txm.utils.ConsoleProgressBar;
|
56 |
|
import org.txm.utils.DeleteDir;
|
57 |
|
import org.txm.utils.io.FileCopy;
|
58 |
52 |
import org.txm.utils.io.IOUtils;
|
59 |
53 |
import org.txm.utils.logger.Log;
|
60 |
54 |
|
61 |
|
import ims.tiger.corpus.Sentence;
|
62 |
|
import ims.tiger.corpus.T_Node;
|
63 |
|
import ims.tiger.index.reader.Index;
|
64 |
55 |
import ims.tiger.index.reader.IndexException;
|
65 |
|
import ims.tiger.index.writer.IndexBuilderErrorHandler;
|
66 |
|
import ims.tiger.index.writer.SimpleErrorHandler;
|
67 |
|
import ims.tiger.index.writer.XMLIndexing;
|
68 |
56 |
import ims.tiger.query.api.QueryIndexException;
|
69 |
|
import ims.tiger.query.processor.CorpusQueryProcessor;
|
70 |
57 |
|
71 |
58 |
/**
|
72 |
|
* Import TIGERSearch annotations into a TXM corpus
|
|
59 |
* Export the CoNLL-U properties and the CQP words as a CoNLL-U corpus made of several files (one per text)
|
73 |
60 |
*
|
74 |
|
* If the corpus already contains TIGER annotations, they are replaced
|
75 |
|
*
|
76 |
|
* The annotations are given using a TIGERSearch binary corpus OR a TIGER source directory using a "main.xml" file
|
77 |
|
*
|
78 |
61 |
* @author mdecorde.
|
79 |
62 |
*/
|
80 |
63 |
public class ExportCorpusAsCONNLU extends AbstractHandler {
|
81 |
64 |
|
82 |
|
public static final String ID = "org.txm.rcp.commands.function.ComputeTSIndex"; //$NON-NLS-1$
|
|
65 |
public static final String ID = ExportCorpusAsCONNLU.class.getName();
|
83 |
66 |
|
|
67 |
@Option(name = "connluResultDirectory", usage = "connluResultDirectory", widget = "Folder", required = true, def = "connlu-result-directory")
|
|
68 |
File connluResultDirectory;
|
|
69 |
|
|
70 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud")
|
|
71 |
String propertiesPrefix;
|
|
72 |
|
|
73 |
// @Option(name = "useUDForms", usage = "use the ud form property instead of CQP 'word' property", widget = "Boolean", required = true, def = "true")
|
|
74 |
Boolean useUDForms = true;
|
|
75 |
|
|
76 |
// @Option(name = "transfertAllWords", usage = "Transfert word not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
|
77 |
Boolean transfertAllWords = true;
|
|
78 |
|
|
79 |
// @Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
|
|
80 |
Boolean detectGap = false;
|
|
81 |
|
|
82 |
@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
|
|
83 |
String formCorrPropertyName;
|
|
84 |
|
|
85 |
@Option(name = "lemmaCorrPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
|
|
86 |
String lemmaCorrPropertyName;
|
|
87 |
|
|
88 |
@Option(name = "uposCorrPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
|
|
89 |
String uposCorrPropertyName;
|
|
90 |
|
|
91 |
@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
|
|
92 |
String xposCorrPropertyName;
|
|
93 |
|
|
94 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
95 |
|
84 |
96 |
/*
|
85 |
97 |
* (non-Javadoc)
|
86 |
98 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
|
... | ... | |
91 |
103 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
|
92 |
104 |
|
93 |
105 |
Object s = selection.getFirstElement();
|
94 |
|
if (s instanceof MainCorpus) {
|
95 |
|
CQPCorpus corpus = (CQPCorpus) s;
|
96 |
|
MainCorpus mainCorpus = corpus.getMainCorpus();
|
97 |
|
|
98 |
|
File tigerCorpusDirectory = null;
|
99 |
|
DirectoryDialog dialog = new DirectoryDialog(HandlerUtil.getActiveShell(event), SWT.OPEN);
|
100 |
|
String path = dialog.open();
|
101 |
|
if (path == null) {
|
102 |
|
Log.warning("Aborting annotation importation.");
|
103 |
|
return null;
|
104 |
|
}
|
105 |
|
else {
|
106 |
|
tigerCorpusDirectory = new File(path);
|
107 |
|
}
|
108 |
|
|
109 |
|
File tigerDirectory = new File(mainCorpus.getProjectDirectory(), "tiger");
|
110 |
|
File tigerCorpusExistingDirectory = new File(tigerDirectory, tigerCorpusDirectory.getName());
|
111 |
|
if (tigerCorpusExistingDirectory.exists()) {
|
112 |
|
boolean doIt = MessageDialog.openConfirm(HandlerUtil.getActiveShell(event), "Replace existing annotations", "TIGERSearch annotations already exists, replace them ?");
|
113 |
|
if (!doIt) {
|
114 |
|
Log.warning("Aborting annotation importation.");
|
115 |
|
return null;
|
116 |
|
}
|
117 |
|
}
|
118 |
|
|
119 |
|
if (new File(tigerCorpusDirectory, "word.lexicon").exists() && new File(tigerCorpusDirectory, "corpus_config.xml").exists()) {
|
120 |
|
// ok this is a TIGERSearch binary corpus
|
121 |
|
}
|
122 |
|
else {
|
123 |
|
|
124 |
|
// need to build a TIGERSearch binary corpus
|
125 |
|
File tigerBinaryCorpusDirectory = new File(tigerCorpusDirectory, "tiger");
|
126 |
|
if (!buildTIGERCorpus(mainCorpus, tigerCorpusDirectory, tigerBinaryCorpusDirectory)) {
|
127 |
|
Log.warning("Aborting annotation importation.");
|
128 |
|
return null;
|
129 |
|
}
|
130 |
|
tigerCorpusDirectory = new File(tigerBinaryCorpusDirectory, corpus.getName());
|
131 |
|
}
|
132 |
|
|
133 |
|
try {
|
134 |
|
return importAnnotations(mainCorpus, tigerCorpusDirectory, "editionId");
|
135 |
|
}
|
136 |
|
catch (Exception e) {
|
137 |
|
e.printStackTrace();
|
138 |
|
return null;
|
139 |
|
}
|
140 |
|
}
|
141 |
|
else {
|
|
106 |
if (!(s instanceof MainCorpus)) {
|
142 |
107 |
Log.warning("Selection is not a corpus. Aborting.");
|
143 |
108 |
return null;
|
144 |
109 |
}
|
145 |
|
}
|
146 |
|
|
147 |
|
private boolean buildTIGERCorpus(MainCorpus corpus, File sourceDirectory, File tigerDir) {
|
148 |
|
tigerDir.mkdirs();
|
149 |
110 |
|
150 |
|
File configfile = new File(tigerDir, "tigersearch.logprop");
|
151 |
|
if (!configfile.exists()) {
|
152 |
|
TSCorpus.createLogPropFile(tigerDir);
|
|
111 |
ParametersDialog.open(this);
|
|
112 |
connluResultDirectory.mkdirs();
|
|
113 |
if (connluResultDirectory == null || !connluResultDirectory.exists() || !connluResultDirectory.isDirectory()) {
|
|
114 |
Log.warning("Error: connlu result directory does not exists: " + connluResultDirectory);
|
|
115 |
return null;
|
153 |
116 |
}
|
154 |
117 |
|
155 |
|
BasicConfigurator.configure();
|
156 |
|
File master = new File(sourceDirectory, "main.xml");
|
157 |
|
if (!master.exists()) master = new File(sourceDirectory, "master.xml");
|
|
118 |
CQPCorpus corpus = (CQPCorpus) s;
|
|
119 |
MainCorpus mainCorpus = corpus.getMainCorpus();
|
158 |
120 |
|
159 |
|
if (!master.exists()) {
|
160 |
|
Log.warning("Error: Can't create TIGERSearch corpus: no main or master file found in " + sourceDirectory);
|
161 |
|
return false;
|
162 |
|
}
|
163 |
|
String uri = master.getAbsolutePath(); // TIGER corpus source root file
|
164 |
|
File tigerBinDir = new File(tigerDir, corpus.getName());
|
165 |
|
tigerBinDir.mkdirs();
|
166 |
121 |
try {
|
167 |
|
IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()) {
|
168 |
|
|
169 |
|
@Override
|
170 |
|
public void setMessage(String message) {}
|
171 |
|
|
172 |
|
@Override
|
173 |
|
public void setNumberOfSentences(int number) {}
|
174 |
|
|
175 |
|
@Override
|
176 |
|
public void setProgressBar(int value) {}
|
177 |
|
};
|
178 |
|
|
179 |
|
XMLIndexing indexing = new XMLIndexing(corpus.getName(), uri, tigerBinDir.getAbsolutePath(), handler, false);
|
180 |
|
|
181 |
|
indexing.startIndexing();
|
182 |
|
|
183 |
|
File logs = new File(tigerBinDir, "indexing.log");
|
184 |
|
|
185 |
|
String txt = IOUtils.getText(logs);
|
186 |
|
if (txt.contains("Error in corpus graph ")) {
|
187 |
|
Log.warning("Error while importing TIGER corpus: " + txt);
|
188 |
|
return false;
|
189 |
|
}
|
|
122 |
return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, detectGap);
|
190 |
123 |
}
|
191 |
124 |
catch (Exception e) {
|
192 |
|
System.out.println(e.getMessage());
|
193 |
|
return false;
|
|
125 |
Log.warning(e);
|
|
126 |
e.printStackTrace();
|
194 |
127 |
}
|
195 |
|
return true;
|
|
128 |
|
|
129 |
return null;
|
196 |
130 |
}
|
197 |
131 |
|
198 |
132 |
/**
|
... | ... | |
208 |
142 |
* @throws CqiServerError
|
209 |
143 |
* @throws IOException
|
210 |
144 |
* @throws UnexpectedAnswerException
|
|
145 |
* @throws InvalidCqpIdException
|
211 |
146 |
*/
|
212 |
|
public static int importAnnotations(MainCorpus corpus, File tigerCorpusDirectory, String wordIdAttribute) throws IndexException, QueryIndexException, UnexpectedAnswerException, IOException,
|
|
147 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String formCorrPropertyName, String lemmaCorrPropertyName,
|
|
148 |
String uposCorrPropertyName, String xposCorrPropertyName, boolean detectGap) throws UnexpectedAnswerException,
|
|
149 |
IOException,
|
213 |
150 |
CqiServerError,
|
214 |
|
CqiClientException {
|
|
151 |
CqiClientException, InvalidCqpIdException {
|
215 |
152 |
|
216 |
|
// TXM corpus files
|
217 |
|
File tigerDirectory = new File(corpus.getProjectDirectory(), "tiger");
|
218 |
|
File tigerCorpusExistingDirectory = new File(tigerDirectory, corpus.getName());
|
219 |
|
DeleteDir.deleteDirectory(tigerCorpusExistingDirectory);
|
220 |
|
tigerCorpusExistingDirectory.mkdirs();
|
221 |
|
|
222 |
|
File configfile = new File(tigerDirectory, "tigersearch.logprop");
|
223 |
|
if (!configfile.exists()) {
|
224 |
|
TSCorpus.createLogPropFile(tigerDirectory);
|
|
153 |
if (!conlluResultDirectory.exists()) {
|
|
154 |
conlluResultDirectory.mkdirs();
|
225 |
155 |
}
|
|
156 |
int numberOfWordsWritten = 0;
|
|
157 |
int numberOfSentencesWritten = 0;
|
|
158 |
int numberOfTextsWritten = 0;
|
226 |
159 |
|
227 |
|
AbstractCqiClient CQI = CQPSearchEngine.getCqiClient();
|
|
160 |
String[] textIds = mainCorpus.getCorpusTextIdsList();
|
|
161 |
int[] start_limits = mainCorpus.getTextStartLimits();
|
|
162 |
int[] end_limits = mainCorpus.getTextEndLimits();
|
228 |
163 |
|
229 |
|
TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile);
|
230 |
|
|
231 |
|
TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName());
|
232 |
|
InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager;
|
233 |
|
CorpusQueryProcessor processor = tigermanager.getQueryProcessor();
|
234 |
|
|
235 |
|
Index index = processor.getIndex();
|
236 |
|
int size = 0;
|
237 |
|
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
|
238 |
|
size += index.getNumberOfTNodes(nr);
|
239 |
|
}
|
240 |
|
|
241 |
|
if (size == 0) {
|
242 |
|
Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting.");
|
243 |
|
return 0;
|
244 |
|
}
|
245 |
|
|
246 |
|
Log.info("Importing " + size + " word annotations...");
|
247 |
|
|
248 |
|
// compute start position of sentences
|
249 |
|
int[] starts = new int[index.getNumberOfGraphs()];
|
250 |
|
for (int i = 0; i < index.getNumberOfGraphs(); i++) {
|
251 |
|
starts[i] = 0;
|
252 |
|
if (i > 0) {
|
253 |
|
starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1];
|
|
164 |
for (String p : propNames) {
|
|
165 |
WordProperty wp = mainCorpus.getProperty(prefix + p);
|
|
166 |
if (wp == null) {
|
|
167 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p);
|
|
168 |
return 0;
|
254 |
169 |
}
|
255 |
170 |
}
|
256 |
171 |
|
257 |
|
File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data");
|
258 |
|
RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw");
|
259 |
|
FileChannel offsetsFileChannel = offsetsRAFile.getChannel();
|
260 |
|
MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES);
|
261 |
|
// out.putInt(positions[i])
|
262 |
|
|
263 |
|
File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data");
|
264 |
|
RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw");
|
265 |
|
FileChannel presencesFileChannel = presencesRAFile.getChannel();
|
266 |
|
MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size);
|
267 |
|
|
268 |
|
int numberOfWordsAnnotated = 0;
|
269 |
|
|
270 |
|
// for each sentence
|
271 |
|
ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs());
|
272 |
|
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
|
273 |
|
cpb.tick();
|
274 |
|
int sent_size = index.getNumberOfTNodes(nr);
|
275 |
|
Sentence sent = tcorpus.manager.getSentence(nr);
|
|
172 |
for (int iText = 0; iText < start_limits.length; iText++) {
|
276 |
173 |
|
277 |
|
String[] ids = new String[sent_size];
|
278 |
|
int[] tigerPositions = new int[sent_size];
|
279 |
|
for (int t = 0; t < sent_size; t++) {
|
280 |
|
T_Node terminal = (T_Node) sent.getTerminalAt(t);
|
281 |
|
ids[t] = terminal.getFeature(wordIdAttribute);
|
282 |
|
|
283 |
|
// try fixing ID
|
284 |
|
if (ids[t].startsWith("w")) {
|
285 |
|
if (!ids[t].startsWith("w_")) {
|
286 |
|
ids[t] = "w_" + ids[t].substring(1);
|
|
174 |
// Build corpus positions
|
|
175 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1];
|
|
176 |
int tmp = 0;
|
|
177 |
for (int n = start_limits[iText]; n <= end_limits[iText]; n++) {
|
|
178 |
positions[tmp++] = n;
|
|
179 |
}
|
|
180 |
numberOfWordsWritten += positions.length;
|
|
181 |
|
|
182 |
// Get UD properties
|
|
183 |
WordProperty wp;
|
|
184 |
wp = mainCorpus.getProperty(prefix + "id");
|
|
185 |
String[] idValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
186 |
|
|
187 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form");
|
|
188 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions);
|
|
189 |
|
|
190 |
wp = mainCorpus.getProperty(prefix + "lemma");
|
|
191 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
192 |
|
|
193 |
wp = mainCorpus.getProperty(prefix + "upos");
|
|
194 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
195 |
|
|
196 |
wp = mainCorpus.getProperty(prefix + "xpos");
|
|
197 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
198 |
|
|
199 |
wp = mainCorpus.getProperty(prefix + "feats");
|
|
200 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
201 |
|
|
202 |
wp = mainCorpus.getProperty(prefix + "head");
|
|
203 |
String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
204 |
|
|
205 |
wp = mainCorpus.getProperty(prefix + "deprel");
|
|
206 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
207 |
|
|
208 |
wp = mainCorpus.getProperty(prefix + "deps");
|
|
209 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
210 |
|
|
211 |
wp = mainCorpus.getProperty(prefix + "misc");
|
|
212 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
|
|
213 |
// print UD sentences
|
|
214 |
|
|
215 |
|
|
216 |
// build sentence, first pass using UD word sentence positions
|
|
217 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
|
|
218 |
ArrayList<Integer> tmpSentence = new ArrayList<>();
|
|
219 |
for (int p = 0; p < positions.length; p++) {
|
|
220 |
// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats="
|
|
221 |
// + featsValues[p] + " head="
|
|
222 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
|
|
223 |
if (idValues[p].equals("1")) {
|
|
224 |
|
|
225 |
if (tmpSentence.size() > 0) {
|
|
226 |
sentences.add(new ArrayList<>(tmpSentence));
|
287 |
227 |
}
|
|
228 |
|
|
229 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
|
|
230 |
// feats="
|
|
231 |
// + featsValues[p] + " head="
|
|
232 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
|
|
233 |
tmpSentence.clear();
|
288 |
234 |
}
|
289 |
|
else {
|
290 |
|
ids[t] = "w_" + ids[t];
|
291 |
|
}
|
292 |
|
tigerPositions[t] = starts[nr] + t;
|
293 |
|
// System.out.println("T id="+terminal.getID());
|
|
235 |
|
|
236 |
tmpSentence.add(p);
|
294 |
237 |
}
|
|
238 |
positions = null; // free memory
|
295 |
239 |
|
296 |
|
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids);
|
297 |
|
Integer[] cqpPositions = new Integer[sent_size];
|
298 |
|
Integer[] offsets = new Integer[sent_size];
|
299 |
|
for (int t = 0; t < sent_size; t++) {
|
300 |
|
if (ids_idx[t] >= 0) {
|
301 |
|
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]);
|
302 |
|
if (positions.length > 1) {
|
303 |
|
Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]);
|
304 |
|
}
|
305 |
|
cqpPositions[t] = positions[0]; // take the first position
|
|
240 |
// fixing sentences
|
|
241 |
for (int s = 0; s < sentences.size(); s++) {
|
|
242 |
// int c = 0;
|
|
243 |
// ArrayList<Integer> sentence = sentences.get(s);
|
|
244 |
// for (int ip = 0 ; ip < sentence.size() ; ip++) {
|
|
245 |
//
|
|
246 |
// int p = sentence.get(ip);
|
|
247 |
//
|
|
248 |
// if (idValues[p].equals("__UNDEF__")) {
|
|
249 |
// c++;
|
|
250 |
// }
|
|
251 |
// }
|
|
252 |
// if (c == 0) { // al is fine
|
|
253 |
//
|
|
254 |
// } else if (c )
|
|
255 |
}
|
|
256 |
|
|
257 |
if (tmpSentence.size() > 0) { // add last sentence
|
|
258 |
sentences.add(new ArrayList<>(tmpSentence));
|
|
259 |
}
|
|
260 |
|
|
261 |
// fixing sentence __NULL__ ud properties
|
|
262 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
|
263 |
ArrayList<Integer> sentence = sentences.get(iSentence);
|
|
264 |
|
|
265 |
int[] sentencePositions = new int[sentence.size()];
|
|
266 |
for (int p = 0; p < sentence.size(); p++)
|
|
267 |
sentencePositions[p] = sentence.get(p);
|
|
268 |
|
|
269 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
|
|
270 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
|
|
271 |
|
|
272 |
String[] words = null;
|
|
273 |
if (formCorrPropertyName != null && formCorrPropertyName.length() > 0) {
|
|
274 |
words = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(formCorrPropertyName).getQualifiedName(), sentencePositions);
|
306 |
275 |
}
|
307 |
|
else { // word not in the CQP corpus
|
308 |
|
Log.warning("Could not find word for id=" + ids[t]);
|
309 |
|
cqpPositions[t] = null;
|
|
276 |
String[] lemmas = null;
|
|
277 |
if (lemmaCorrPropertyName != null && lemmaCorrPropertyName.length() > 0) {
|
|
278 |
lemmas = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(lemmaCorrPropertyName).getQualifiedName(), sentencePositions);
|
310 |
279 |
}
|
|
280 |
String[] upos = null;
|
|
281 |
if (uposCorrPropertyName != null && uposCorrPropertyName.length() > 0) {
|
|
282 |
upos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(uposCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
283 |
}
|
|
284 |
String[] xpos = null;
|
|
285 |
if (xposCorrPropertyName != null && xposCorrPropertyName.length() > 0) {
|
|
286 |
xpos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(xposCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
287 |
}
|
311 |
288 |
|
312 |
|
if (cqpPositions[t] != null) {
|
313 |
|
offsets[t] = cqpPositions[t] - tigerPositions[t];
|
|
289 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
290 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
291 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
292 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
|
|
293 |
|
|
294 |
HashMap<String, String> sentIds = new HashMap<>();
|
|
295 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
|
296 |
int p = sentence.get(ip);
|
|
297 |
|
|
298 |
if (!idValues[p].equals("__UNDEF__")) { // store "old id -> new id"
|
|
299 |
sentIds.put(idValues[p], "" + (ip + 1)); // from 1 to N
|
|
300 |
}
|
|
301 |
|
|
302 |
// new word
|
|
303 |
if (miscValues[p].equals("__UNDEF__")) {
|
|
304 |
miscValues[p] = "XmlId=" + ids[ip];
|
|
305 |
}
|
|
306 |
|
|
307 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
308 |
if (words != null && formValues[p].equals("__UNDEF__")) {
|
|
309 |
formValues[p] = words[ip];
|
|
310 |
}
|
|
311 |
if (lemmas != null && lemmaValues[p].equals("__UNDEF__")) {
|
|
312 |
lemmaValues[p] = lemmas[ip];
|
|
313 |
}
|
|
314 |
if (upos != null && uposValues[p].equals("__UNDEF__")) {
|
|
315 |
uposValues[p] = upos[ip];
|
|
316 |
}
|
|
317 |
if (xpos != null && xposValues[p].equals("__UNDEF__")) {
|
|
318 |
xposValues[p] = xpos[ip];
|
|
319 |
}
|
314 |
320 |
}
|
315 |
|
else {
|
316 |
|
offsets[t] = null;
|
|
321 |
|
|
322 |
// fixing head and set missing head to 0 and root
|
|
323 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
|
324 |
int p = sentence.get(ip);
|
|
325 |
|
|
326 |
|
|
327 |
|
|
328 |
// fixing id value
|
|
329 |
idValues[p] = "" + (ip + 1); // from 1 to N
|
|
330 |
|
|
331 |
// fixing head values
|
|
332 |
if (sentIds.containsKey(headValues[p])) {
|
|
333 |
headValues[p] = sentIds.get(headValues[p]);
|
|
334 |
}
|
|
335 |
else { // new word, set to default values
|
|
336 |
headValues[p] = "0";
|
|
337 |
deprelValues[p] = "root";
|
|
338 |
depsValues[p] = "_";
|
|
339 |
}
|
317 |
340 |
}
|
318 |
341 |
}
|
319 |
|
// System.out.println("ids="+Arrays.toString(ids));
|
320 |
|
// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
|
321 |
|
// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
|
322 |
|
// System.out.println("cqp positions="+Arrays.toString(cqpPositions));
|
323 |
|
// System.out.println("offsets="+Arrays.toString(offsets));
|
324 |
342 |
|
325 |
|
// writing data to offset and presences files
|
326 |
|
for (int t = 0; t < sent_size; t++) {
|
|
343 |
// writing sentences
|
|
344 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu");
|
|
345 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile);
|
|
346 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
|
347 |
ArrayList<Integer> sentence = sentences.get(iSentence);
|
327 |
348 |
|
328 |
|
if (offsets[t] != null) {
|
329 |
|
numberOfWordsAnnotated++;
|
330 |
|
presencesMapped.put((byte) 1);
|
331 |
|
offsetsMapped.putInt(offsets[t]);
|
|
349 |
int[] sentencePositions = new int[sentence.size()];
|
|
350 |
for (int p = 0; p < sentence.size(); p++)
|
|
351 |
sentencePositions[p] = sentence.get(p);
|
|
352 |
|
|
353 |
String[] gap = null;
|
|
354 |
if (detectGap && mainCorpus.getProperty("gap") != null) {
|
|
355 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
|
332 |
356 |
}
|
333 |
|
else {
|
334 |
|
presencesMapped.put((byte) 0);
|
335 |
|
offsetsMapped.putInt(0);
|
|
357 |
|
|
358 |
String[] tokens = new String[sentences.size()];
|
|
359 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
|
360 |
tokens[ip] = formValues[sentence.get(ip)];
|
336 |
361 |
}
|
|
362 |
|
|
363 |
writer.println("# text = " + StringUtils.join(tokens, " "));
|
|
364 |
writer.println("# newdoc id = " + textIds[iText]);
|
|
365 |
writer.println("# sent_id = " + (iSentence + 1));
|
|
366 |
|
|
367 |
for (int p : sentence) {
|
|
368 |
|
|
369 |
if (gap != null && gap[p] != null) writer.println("# gap");
|
|
370 |
|
|
371 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
372 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
|
|
373 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
|
|
374 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
375 |
}
|
|
376 |
writer.println("");
|
|
377 |
numberOfSentencesWritten++;
|
337 |
378 |
}
|
|
379 |
writer.close();
|
|
380 |
|
|
381 |
numberOfTextsWritten++;
|
338 |
382 |
}
|
339 |
|
cpb.done();
|
340 |
383 |
|
341 |
|
offsetsFileChannel.close();
|
342 |
|
offsetsRAFile.close();
|
343 |
|
presencesFileChannel.close();
|
344 |
|
presencesRAFile.close();
|
|
384 |
System.out.println("N words written: " + numberOfWordsWritten);
|
|
385 |
System.out.println("N sentences written: " + numberOfSentencesWritten);
|
|
386 |
System.out.println("N texts written: " + numberOfTextsWritten);
|
345 |
387 |
|
346 |
|
Log.info("Finalizing TIGERSearch corpus");
|
347 |
|
if (numberOfWordsAnnotated > 0) {
|
348 |
|
FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory);
|
349 |
|
Log.info("Done. " + numberOfWordsAnnotated + " words annotated.");
|
350 |
|
}
|
351 |
|
else {
|
352 |
|
Log.warning("Warning: no words could be aligned with the CQP corpus. Aborting");
|
353 |
|
}
|
354 |
|
|
355 |
|
return numberOfWordsAnnotated;
|
|
388 |
return numberOfWordsWritten;
|
356 |
389 |
}
|
357 |
390 |
}
|