Révision 2987

tmp/org.txm.treetagger.core/src/org/txm/treetagger/core/TreeTaggerEngine.java (revision 2987)
8 8
import org.txm.annotation.core.AnnotationEngine;
9 9
import org.txm.core.results.TXMResult;
10 10
import org.txm.importer.xmltxm.Annotate;
11
import org.txm.importer.xmltxm.AnnotateCQP;
11 12
import org.txm.objects.CorpusBuild;
12 13
import org.txm.treetagger.core.preferences.TreeTaggerPreferences;
14
import org.txm.utils.FileUtils;
13 15
import org.txm.utils.logger.Log;
14 16
import org.txm.utils.treetagger.TreeTagger;
15 17

  
......
80 82
	public boolean processFile(File xmlFile, File binaryCorpusDirectory, HashMap<String, Object> parameters) {
81 83
		if (!isRunning()) return false;
82 84
		
83
		Annotate annotate = new Annotate();
85
		
84 86
		Object p = parameters.get("lang");
85 87
		if (p == null) {
86 88
			System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters));
......
90 92
		if (!canAnnotateLang(lang)) {
91 93
			return false;
92 94
		}
93
		return annotate.run(xmlFile, lang, binaryCorpusDirectory, xmlFile.getParentFile());
95
		boolean fixExistingValues = false; // default behavior is to replace existing values
96
		if (parameters.get("fix_existing_values") != null) {
97
			fixExistingValues = "true".equals(parameters.get("fix_existing_values"));
98
		}
99
		if (FileUtils.isExtension(xmlFile, "cqp")) {
100
			AnnotateCQP annotate = new AnnotateCQP();
101
			return annotate.run(xmlFile, lang, binaryCorpusDirectory, xmlFile.getParentFile());
102
		} else {
103
			Annotate annotate = new Annotate();
104
			return annotate.run(xmlFile, lang, fixExistingValues, binaryCorpusDirectory, xmlFile.getParentFile());
105
		}
94 106
	}
95 107
	
96 108
	/**
......
103 115
		Object p = parameters.get("langs");
104 116
		if (p != null && p instanceof HashMap<?, ?>) {
105 117
			
106
			
107 118
			Annotate annotate = new Annotate();
108 119
			HashMap<String, String> langs = (HashMap<String, String>) p;
109 120
			return annotate.run(binaryCorpusDirectory, new File(binaryCorpusDirectory, "txm"), langs);
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotateWTC.groovy (revision 2987)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $
25
// $LastChangedRevision: 2583 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xmltxm
29

  
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33

  
34
import org.txm.Toolbox
35
import org.txm.importer.*
36
import org.txm.importer.cwb.*
37
import org.txm.objects.*
38
import org.txm.treetagger.core.preferences.TreeTaggerPreferences
39
import org.txm.utils.ConsoleProgressBar
40
import org.txm.utils.DeleteDir;
41
import org.txm.utils.LangDetector;
42
import org.txm.utils.io.IOUtils
43
import org.txm.utils.treetagger.TreeTagger
44

  
45
// TODO: Auto-generated Javadoc
46
/**
 * Runs TreeTagger over the files of a source directory.
 *
 * Every file of the source directory accepted by IOUtils.HIDDENFILE_FILTER is
 * tagged with the given model and the TreeTagger output is written, under the
 * same file name, to the "annotations" directory of the binary corpus directory.
 * This class does not delete or rename the source files itself.
 */
class AnnotateCQP {

	/** When true, the paths and settings used by run() are printed to the console. */
	boolean debug = true;

	/**
	 * Enables the debug output.
	 * NOTE(review): debug is already true by default, so this call currently changes nothing.
	 */
	public void setDebug() { debug = true; }

	/** Not read or written by the methods of this class. */
	String id;

	/**
	 * Applies TreeTagger to one file.
	 *
	 * The tagger is configured with the token, lemma, quiet, sgml and no-unknown
	 * switches and with "&lt;s&gt;" as end-of-sentence tag.
	 *
	 * @param ttsrcfile the file to tag
	 * @param ttoutfile the file receiving the TreeTagger output
	 * @param modelfile the TreeTagger model file
	 * @return true if the tagger call completed without throwing, false otherwise
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		try {
			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			String opt = Toolbox.getPreference(Toolbox.TREETAGGER_OPTIONS);
			if (opt == null) opt = "";
			// NOTE(review): the options are split on TWO spaces; an empty preference yields one empty option - confirm the wrapper tolerates it
			String[] options = opt.split("  ");
			def tt = new TreeTagger(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
		return true;
	}

	/**
	 * Tags all the files of a directory.
	 *
	 * @param binDir the binary corpus directory; its "annotations" sub-directory receives the results
	 * @param txmDir the directory containing the files to tag
	 * @param modelfilename the name of the model file, resolved against the TreeTagger models directory preference
	 * @return true if every file was tagged (or was already up to date); false if the TreeTagger
	 *         binaries or the model file are missing, if txmDir cannot be listed or is empty,
	 *         or if tagging one file failed
	 */
	public boolean run(File binDir, File txmDir,  String modelfilename)
	{
		// the models path is a String preference: reading it with getBoolean() cannot be used to build a File
		String modelsPath = TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)

		if (debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)
			println "TREETAGGER MODELS PATH : "+modelsPath
		}

		// test if the Toolbox knows TreeTagger
		String ttBinPath = Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/"
		if (!new File(ttBinPath).exists()) {
			println("Could not find TreeTagger binaries in "+ttBinPath)
			return false;
		}

		// output directory
		File annotDir = new File(binDir, "annotations")
		annotDir.mkdir();

		// files to tag; listFiles() returns null when txmDir is not a readable directory
		File[] files = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER)

		// get model file and check it
		File modelfile = new File(modelsPath, modelfilename);
		if (debug)
			println "model file : "+modelfile;

		if (!modelfile.exists()) {
			println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
			if (System.getProperty("os.name").startsWith("Windows"))
				println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
			return false;
		}

		// guard BEFORE using 'files' in the progress message
		if (files == null) {
			println "Skipping ANNOTATE: could not list the files of "+txmDir
			return false;
		}

		// APPLY TREETAGGER
		println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+files.length+" files)")
		if (files.length == 0)
			return false;

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
		for (File infile : files) {
			cpb.tick();

			File outfile = new File(annotDir, infile.getName());
			// an existing result that is at least as recent as its source AND bigger than it is considered up to date
			boolean upToDate = outfile.exists() &&
					outfile.lastModified() >= infile.lastModified() &&
					outfile.length() > infile.length()
			if (upToDate) continue;

			if (!applyTT(infile, outfile, modelfile)) {
				System.out.println("Failed to apply treetagger on file "+infile);
				return false;
			}
		}
		println("")

		// the results stay in the "annotations" directory: txmDir is neither deleted nor replaced here
		return true;
	}
}
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/Annotate.groovy (revision 2987)
546 546
		return true;
547 547
	}
548 548
	
549
	public boolean run(File f, String lang, File binDir, File txmDir) {
549
	public boolean run(File f, String lang, boolean fixExistingValues, File binDir, File txmDir) {
550 550
		
551 551
		File annotDir = new File(binDir,"annotations");
552 552
		annotDir.mkdir();
......
555 555
		File treetaggerDir = new File(binDir,"treetagger");
556 556
		treetaggerDir.mkdir();
557 557
		
558
		return run(f, lang, binDir, txmDir, false, new String[0], new String[0], annotDir, ptreetaggerDir, treetaggerDir)
558
		return run(f, lang, binDir, txmDir, fixExistingValues, new String[0], new String[0], annotDir, ptreetaggerDir, treetaggerDir)
559 559
	}
560 560
	
561
	public boolean run(File f, String lang, File binDir, File txmDir, boolean replace, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) {
561
	public boolean run(File f, String lang, File binDir, File txmDir, boolean fixExistingValues, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) {
562 562
		
563 563
		File modelfile = new File(modelsDirectory, lang+".par");
564 564
		if (!"??".equals(lang) && !modelfile.exists()) {
......
593 593
		
594 594
		//INJECT ANNOTATIONS
595 595
		File tmpFile = new File(txmDir, "temp_"+f.getName())
596
		builder = new AnnotationInjection(f.toURL(), annotfile.toURL(), replace);
596
		builder = new AnnotationInjection(f.toURL(), annotfile.toURL(), fixExistingValues);
597 597
		builder.process(tmpFile);
598 598
		if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f
599 599
		
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotateCQP.groovy (revision 2987)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $
25
// $LastChangedRevision: 2583 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xmltxm
29

  
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33

  
34
import org.txm.Toolbox
35
import org.txm.importer.*
36
import org.txm.importer.cwb.*
37
import org.txm.objects.*
38
import org.txm.treetagger.core.preferences.TreeTaggerPreferences
39
import org.txm.utils.ConsoleProgressBar
40
import org.txm.utils.DeleteDir;
41
import org.txm.utils.LangDetector;
42
import org.txm.utils.io.IOUtils
43
import org.txm.utils.treetagger.TreeTagger
44

  
45
// TODO: Auto-generated Javadoc
46
/**
 * Runs TreeTagger over the files of a source directory.
 *
 * Every file of the source directory accepted by IOUtils.HIDDENFILE_FILTER is
 * tagged with the given model and the TreeTagger output is written, under the
 * same file name, to the "annotations" directory of the binary corpus directory.
 * This class does not delete or rename the source files itself.
 */
class AnnotateCQP {

	/** When true, the paths and settings used by run() are printed to the console. */
	boolean debug = true;

	/**
	 * Enables the debug output.
	 * NOTE(review): debug is already true by default, so this call currently changes nothing.
	 */
	public void setDebug() { debug = true; }

	/** Not read or written by the methods of this class. */
	String id;

	/**
	 * Applies TreeTagger to one file.
	 *
	 * The tagger is configured with the token, lemma, quiet, sgml and no-unknown
	 * switches and with "&lt;s&gt;" as end-of-sentence tag.
	 *
	 * @param ttsrcfile the file to tag
	 * @param ttoutfile the file receiving the TreeTagger output
	 * @param modelfile the TreeTagger model file
	 * @return true if the tagger call completed without throwing, false otherwise
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		try {
			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			String opt = Toolbox.getPreference(Toolbox.TREETAGGER_OPTIONS);
			if (opt == null) opt = "";
			// NOTE(review): the options are split on TWO spaces; an empty preference yields one empty option - confirm the wrapper tolerates it
			String[] options = opt.split("  ");
			def tt = new TreeTagger(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
		return true;
	}

	/**
	 * Tags all the files of a directory.
	 *
	 * @param binDir the binary corpus directory; its "annotations" sub-directory receives the results
	 * @param txmDir the directory containing the files to tag
	 * @param modelfilename the name of the model file, resolved against the TreeTagger models directory preference
	 * @return true if every file was tagged (or was already up to date); false if the TreeTagger
	 *         binaries or the model file are missing, if txmDir cannot be listed or is empty,
	 *         or if tagging one file failed
	 */
	public boolean run(File binDir, File txmDir,  String modelfilename)
	{
		// the models path is a String preference: reading it with getBoolean() cannot be used to build a File
		String modelsPath = TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)

		if (debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)
			println "TREETAGGER MODELS PATH : "+modelsPath
		}

		// test if the Toolbox knows TreeTagger
		String ttBinPath = Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/"
		if (!new File(ttBinPath).exists()) {
			println("Could not find TreeTagger binaries in "+ttBinPath)
			return false;
		}

		// output directory
		File annotDir = new File(binDir, "annotations")
		annotDir.mkdir();

		// files to tag; listFiles() returns null when txmDir is not a readable directory
		File[] files = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER)

		// get model file and check it
		File modelfile = new File(modelsPath, modelfilename);
		if (debug)
			println "model file : "+modelfile;

		if (!modelfile.exists()) {
			println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
			if (System.getProperty("os.name").startsWith("Windows"))
				println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
			return false;
		}

		// guard BEFORE using 'files' in the progress message
		if (files == null) {
			println "Skipping ANNOTATE: could not list the files of "+txmDir
			return false;
		}

		// APPLY TREETAGGER
		println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+files.length+" files)")
		if (files.length == 0)
			return false;

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
		for (File infile : files) {
			cpb.tick();

			File outfile = new File(annotDir, infile.getName());
			// an existing result that is at least as recent as its source AND bigger than it is considered up to date
			boolean upToDate = outfile.exists() &&
					outfile.lastModified() >= infile.lastModified() &&
					outfile.length() > infile.length()
			if (upToDate) continue;

			if (!applyTT(infile, outfile, modelfile)) {
				System.out.println("Failed to apply treetagger on file "+infile);
				return false;
			}
		}
		println("")

		// the results stay in the "annotations" directory: txmDir is neither deleted nor replaced here
		return true;
	}

	/**
	 * Single-file entry point; not implemented yet.
	 *
	 * Prints a "not yet implemented" message and performs no annotation.
	 *
	 * @param xmlFile the file to annotate (ignored)
	 * @param lang the language code (ignored)
	 * @param binaryCorpusDirectory the binary corpus directory (ignored)
	 * @param parentFile the directory of the file (ignored)
	 * @return always false
	 */
	public boolean run(File xmlFile, String lang, File binaryCorpusDirectory, File parentFile) {
		System.out.println("AnnotateCQP called from the TreeTaggerEngine.processFile() is not yet implemented.")
		return false;
	}
}
tmp/org.txm.core/src/java/org/txm/importer/scripts/xmltxm/AnnotationInjection.groovy (revision 2987)
57 57

  
58 58
	/** The links. */
59 59
	private LinkedHashSet<String> links;
60
	boolean replace
60
	/**
61
	 * if set, existing values are replaced
62
	 */
63
	boolean fixExistingValues
61 64

  
62 65
	/** The linkparsers. key=type*/
63 66
	private LinkedHashMap<String, XMLStreamReader> linkparsers;
......
97 100
	 * @param url the xml-tei-txm file
98 101
	 * @param anaurl the stand-off file
99 102
	 */
100
	public AnnotationInjection(URL url, URL anaurl, boolean replace) {
103
	public AnnotationInjection(URL url, URL anaurl, boolean fixExistingValues) {
101 104
		super(url); // init reader and writer
102 105
		try {
103 106
			this.anaurl = anaurl;
104
			this.replace = replace
107
			this.fixExistingValues = fixExistingValues
105 108
			factory = XMLInputFactory.newInstance();
106 109
			this.buildLinkParsers();// build a parser per linkgroup
107 110
		} catch (XMLStreamException ex) {
......
288 291
			writer.writeStartElement(TXMNS, "ana");
289 292
			writer.writeAttribute("resp", "#"+respStmtID);
290 293
			writer.writeAttribute("type", "#"+link);
291
			if (replace) {
292
				if (linkparsers.containsKey(link)) { // order is important
293
					writer.writeCharacters(getNextAnaValue(link, wordId));
294
				} else {
295
					writer.writeCharacters(anaValues.get(link));
296
				}
297
			} else { // ok no problem
298
				writer.writeCharacters(getNextAnaValue(link, wordId));
294
			
295
			String newValue = getNextAnaValue(link, wordId)
296
			if (fixExistingValues || anaValues.containsKey(link) == null) {
297
				anaValues.put(link, newValue)
298
			} else {
299
				anaValues.put(link, "")
299 300
			}
300 301
			
302
			writer.writeCharacters(anaValues.get(link));
303
			
301 304
			writer.writeEndElement(); // txm:ana
302 305
		}
303 306
	}
......
311 314
		if (localname.equals("taxonomy")) {
312 315
			String taxo = parser.getAttributeValue(0) // taxonomy type
313 316
			presentTaxonomies.add(taxo);
314
		} else if (flagW && replace && localname.equals("ana")) {
317
		} else if (flagW && localname.equals("ana")) {
315 318
			flagAna = true
316 319
			anaValue= ""
317 320
			type = null

Formats disponibles : Unified diff