Révision 927

tmp/TXMBetaPlugins.site/site.xml (revision 927)
1 1
<?xml version="1.0" encoding="UTF-8"?>
2 2
<site>
3
   <feature url="features/org.txm.ahc.feature_1.0.0.qualifier.jar" id="org.txm.ahc.feature" version="1.0.0.qualifier"/>
4
   <feature url="features/org.txm.analec.feature_1.0.0.qualifier.jar" id="org.txm.analec.feature" version="1.0.0.qualifier"/>
5
   <feature url="features/org.txm.annotation.feature_1.0.0.qualifier.jar" id="org.txm.annotation.feature" version="1.0.0.qualifier"/>
6
   <feature url="features/org.txm.annotation.kr.feature_1.0.0.qualifier.jar" id="org.txm.annotation.kr.feature" version="1.0.0.qualifier"/>
7
   <feature url="features/org.txm.backtomedia.feature_1.0.0.qualifier.jar" id="org.txm.backtomedia.feature" version="1.0.0.qualifier"/>
8
   <feature url="features/org.txm.ca.feature_1.0.0.qualifier.jar" id="org.txm.ca.feature" version="1.0.0.qualifier"/>
9
   <feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.qualifier"/>
10
   <feature url="features/org.txm.chartsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.qualifier"/>
11
   <feature url="features/org.txm.concordance.feature_1.0.0.qualifier.jar" id="org.txm.concordance.feature" version="1.0.0.qualifier"/>
12
   <feature url="features/org.txm.cooccurrence.feature_1.0.0.qualifier.jar" id="org.txm.cooccurrence.feature" version="1.0.0.qualifier"/>
13
   <feature url="features/org.txm.cql2lsa.feature_1.0.0.qualifier.jar" id="org.txm.cql2lsa.feature" version="1.0.0.qualifier"/>
14
   <feature url="features/org.txm.dictionary.feature_1.0.0.qualifier.jar" id="org.txm.dictionary.feature" version="1.0.0.qualifier"/>
15
   <feature url="features/org.txm.edition.feature_1.0.0.qualifier.jar" id="org.txm.edition.feature" version="1.0.0.qualifier"/>
16
   <feature url="features/org.txm.groovy.feature_1.0.0.qualifier.jar" id="org.txm.groovy.feature" version="1.0.0.qualifier"/>
17
   <feature url="features/org.txm.index.feature_1.0.0.qualifier.jar" id="org.txm.index.feature" version="1.0.0.qualifier"/>
18
   <feature url="features/org.txm.internalview.feature_1.0.0.qualifier.jar" id="org.txm.internalview.feature" version="1.0.0.qualifier"/>
19
   <feature url="features/org.txm.lexicaltable.feature_1.0.0.qualifier.jar" id="org.txm.lexicaltable.feature" version="1.0.0.qualifier"/>
20
   <feature url="features/org.txm.partition.feature_1.0.0.qualifier.jar" id="org.txm.partition.feature" version="1.0.0.qualifier"/>
21
   <feature url="features/org.txm.progression.feature_1.0.0.qualifier.jar" id="org.txm.progression.feature" version="1.0.0.qualifier"/>
22
   <feature url="features/org.txm.properties.feature_1.0.0.qualifier.jar" id="org.txm.properties.feature" version="1.0.0.qualifier"/>
23
   <feature url="features/org.txm.rcp.feature_1.0.0.qualifier.jar" id="org.txm.rcp.feature" version="1.0.0.qualifier"/>
24
   <feature url="features/org.txm.referencer.feature_1.0.0.qualifier.jar" id="org.txm.referencer.feature" version="1.0.0.qualifier"/>
25
   <feature url="features/org.txm.searchengine.cqp.feature_1.0.0.qualifier.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.qualifier"/>
26
   <feature url="features/org.txm.specificities.feature_1.0.0.qualifier.jar" id="org.txm.specificities.feature" version="1.0.0.qualifier"/>
27
   <feature url="features/org.txm.statsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.statsengine.r.feature" version="1.0.0.qualifier"/>
28
   <feature url="features/org.txm.svn.feature_1.0.0.qualifier.jar" id="org.txm.svn.feature" version="1.0.0.qualifier"/>
29
   <feature url="features/org.txm.textbalance.feature_1.0.0.qualifier.jar" id="org.txm.textbalance.feature" version="1.0.0.qualifier"/>
30
   <feature url="features/org.txm.tigersearch.feature_1.0.0.qualifier.jar" id="org.txm.tigersearch.feature" version="1.0.0.qualifier"/>
31
   <feature url="features/org.txm.treetagger.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.feature" version="1.0.0.qualifier"/>
32
   <feature url="features/org.txm.treetagger.files.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.files.feature" version="1.0.0.qualifier"/>
33
   <feature url="features/org.txm.wordcloud.feature_1.0.0.qualifier.jar" id="org.txm.wordcloud.feature" version="1.0.0.qualifier"/>
3
   <feature url="features/org.txm.ahc.feature_1.0.0.201806251537.jar" id="org.txm.ahc.feature" version="1.0.0.201806251537"/>
4
   <feature url="features/org.txm.analec.feature_1.0.0.201806251537.jar" id="org.txm.analec.feature" version="1.0.0.201806251537"/>
5
   <feature url="features/org.txm.annotation.feature_1.0.0.201806251537.jar" id="org.txm.annotation.feature" version="1.0.0.201806251537"/>
6
   <feature url="features/org.txm.annotation.kr.feature_1.0.0.201806251537.jar" id="org.txm.annotation.kr.feature" version="1.0.0.201806251537"/>
7
   <feature url="features/org.txm.backtomedia.feature_1.0.0.201806251537.jar" id="org.txm.backtomedia.feature" version="1.0.0.201806251537"/>
8
   <feature url="features/org.txm.ca.feature_1.0.0.201806251537.jar" id="org.txm.ca.feature" version="1.0.0.201806251537"/>
9
   <feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.201806251537"/>
10
   <feature url="features/org.txm.chartsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.201806251537"/>
11
   <feature url="features/org.txm.concordance.feature_1.0.0.201806251537.jar" id="org.txm.concordance.feature" version="1.0.0.201806251537"/>
12
   <feature url="features/org.txm.cooccurrence.feature_1.0.0.201806251537.jar" id="org.txm.cooccurrence.feature" version="1.0.0.201806251537"/>
13
   <feature url="features/org.txm.cql2lsa.feature_1.0.0.201806251537.jar" id="org.txm.cql2lsa.feature" version="1.0.0.201806251537"/>
14
   <feature url="features/org.txm.dictionary.feature_1.0.0.201806251537.jar" id="org.txm.dictionary.feature" version="1.0.0.201806251537"/>
15
   <feature url="features/org.txm.edition.feature_1.0.0.201806251537.jar" id="org.txm.edition.feature" version="1.0.0.201806251537"/>
16
   <feature url="features/org.txm.groovy.feature_1.0.0.201806251537.jar" id="org.txm.groovy.feature" version="1.0.0.201806251537"/>
17
   <feature url="features/org.txm.index.feature_1.0.0.201806251537.jar" id="org.txm.index.feature" version="1.0.0.201806251537"/>
18
   <feature url="features/org.txm.internalview.feature_1.0.0.201806251537.jar" id="org.txm.internalview.feature" version="1.0.0.201806251537"/>
19
   <feature url="features/org.txm.lexicaltable.feature_1.0.0.201806251537.jar" id="org.txm.lexicaltable.feature" version="1.0.0.201806251537"/>
20
   <feature url="features/org.txm.partition.feature_1.0.0.201806251537.jar" id="org.txm.partition.feature" version="1.0.0.201806251537"/>
21
   <feature url="features/org.txm.progression.feature_1.0.0.201806251537.jar" id="org.txm.progression.feature" version="1.0.0.201806251537"/>
22
   <feature url="features/org.txm.properties.feature_1.0.0.201806251537.jar" id="org.txm.properties.feature" version="1.0.0.201806251537"/>
23
   <feature url="features/org.txm.rcp.feature_1.0.0.201806251537.jar" id="org.txm.rcp.feature" version="1.0.0.201806251537"/>
24
   <feature url="features/org.txm.referencer.feature_1.0.0.201806251537.jar" id="org.txm.referencer.feature" version="1.0.0.201806251537"/>
25
   <feature url="features/org.txm.searchengine.cqp.feature_1.0.0.201806251537.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.201806251537"/>
26
   <feature url="features/org.txm.specificities.feature_1.0.0.201806251537.jar" id="org.txm.specificities.feature" version="1.0.0.201806251537"/>
27
   <feature url="features/org.txm.statsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.statsengine.r.feature" version="1.0.0.201806251537"/>
28
   <feature url="features/org.txm.svn.feature_1.0.0.201806251537.jar" id="org.txm.svn.feature" version="1.0.0.201806251537"/>
29
   <feature url="features/org.txm.textbalance.feature_1.0.0.201806251537.jar" id="org.txm.textbalance.feature" version="1.0.0.201806251537"/>
30
   <feature url="features/org.txm.tigersearch.feature_1.0.0.201806251537.jar" id="org.txm.tigersearch.feature" version="1.0.0.201806251537"/>
31
   <feature url="features/org.txm.treetagger.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.feature" version="1.0.0.201806251537"/>
32
   <feature url="features/org.txm.treetagger.files.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.files.feature" version="1.0.0.201806251537"/>
33
   <feature url="features/org.txm.wordcloud.feature_1.0.0.201806251537.jar" id="org.txm.wordcloud.feature" version="1.0.0.201806251537"/>
34 34
</site>
tmp/org.txm.treetagger.core/plugin.xml (revision 927)
7 7
            class="org.txm.treetagger.core.preferences.TreeTaggerPreferences">
8 8
      </initializer>
9 9
   </extension>
10
   <extension
11
         point="org.txm.annotation.core.AnnotationEngine">
12
      <AnnotationEngine
13
            class="org.txm.treetagger.core.TreeTaggerEngine"
14
            description="Call TreeTagger treetagger train processus to annotate corpus">
15
      </AnnotationEngine>
16
   </extension>
10 17

  
11 18
</plugin>
tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.core.prefs (revision 927)
1 1
eclipse.preferences.version=1
2 2
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
3 4
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
5
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
4 6
org.eclipse.jdt.core.compiler.compliance=1.6
7
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8
org.eclipse.jdt.core.compiler.debug.localVariable=generate
9
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
5 10
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
6 11
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
7 12
org.eclipse.jdt.core.compiler.source=1.6
tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 927)
1
eclipse.preferences.version=1
2
groovy.compiler.level=25
3
groovy.script.filters=**/*.dsld,y,**/*.gradle,n
0 4

  
tmp/org.txm.treetagger.core/.classpath (revision 927)
7 7
		</accessrules>
8 8
	</classpathentry>
9 9
	<classpathentry kind="src" path="src"/>
10
	<classpathentry exported="true" kind="con" path="GROOVY_SUPPORT"/>
11
	<classpathentry exported="true" kind="con" path="GROOVY_DSL_SUPPORT"/>
10 12
	<classpathentry kind="output" path="bin"/>
11 13
</classpath>
tmp/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 927)
1 1
Manifest-Version: 1.0
2
Require-Bundle: org.txm.core;bundle-version="0.8.0";visibility:=reexport
3
Export-Package: org.txm.treetagger.core.preferences
2
Require-Bundle: org.txm.annotation.core;visibility:=reexport
3
Export-Package: org.txm.treetagger.core,
4
 org.txm.treetagger.core.preferences
4 5
Bundle-ActivationPolicy: lazy
5 6
Bundle-Version: 1.0.0.qualifier
6 7
Bundle-Name: TreeTagger Core
tmp/org.txm.treetagger.core/.project (revision 927)
22 22
		</buildCommand>
23 23
	</buildSpec>
24 24
	<natures>
25
		<nature>org.eclipse.jdt.groovy.core.groovyNature</nature>
25 26
		<nature>org.eclipse.pde.PluginNature</nature>
26 27
		<nature>org.eclipse.jdt.core.javanature</nature>
27 28
	</natures>
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotateWTC.groovy (revision 927)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $
25
// $LastChangedRevision: 2583 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xmltxm
29

  
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33

  
34
import org.txm.Toolbox
35
import org.txm.importer.*
36
import org.txm.importer.cwb.*
37
import org.txm.objects.*
38
import org.txm.stat.utils.ConsoleProgressBar
39
import org.txm.utils.DeleteDir;
40
import org.txm.utils.LangDetector;
41
import org.txm.utils.treetagger.TreeTagger
42

  
43
// TODO: Auto-generated Javadoc
44
/**
 * Applies a TreeTagger model to every file of a directory and writes the tagged
 * files into the "annotations" sub-directory of the binary directory.
 */
class AnnotateCQP {

	/** Enables the extra console traces of {@link #run}; enabled by default. */
	boolean debug = true;

	/**
	 * Enables the debug traces.
	 */
	public void setDebug() { debug=true; }

	/** Identifier; neither read nor written by the methods of this class. */
	String id;

	/**
	 * Runs TreeTagger on one file.
	 *
	 * The tagger is created with the options stored in the Toolbox.TREETAGGER_OPTIONS
	 * preference (split on two consecutive spaces) and the binaries found in the "bin"
	 * directory of the Toolbox.TREETAGGER_INSTALL_PATH preference.
	 *
	 * @param ttsrcfile the file given to TreeTagger as input
	 * @param ttoutfile the file TreeTagger writes its result to
	 * @param modelfile the TreeTagger model file
	 * @return true if successful, false if an exception was raised (its stack trace is printed)
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		try {
			String opt = Toolbox.getPreference(Toolbox.TREETAGGER_OPTIONS);
			if (opt == null) opt = "";
			String[] options = opt.split("  ");

			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			def tt = new TreeTagger(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
		return true;
	}

	/**
	 * Applies the model to each file of txmDir.
	 *
	 * A file is skipped when its result already exists in the "annotations" directory,
	 * is at least as recent as the input and is bigger than the input.
	 *
	 * @param binDir the directory in which the "annotations" directory is created
	 * @param txmDir the directory of the files to tag
	 * @param modelfilename the model file name, resolved against the Toolbox.TREETAGGER_MODELS_PATH preference
	 * @return true if successful; false if TreeTagger or the model is not found, if there is
	 *         no file to process or if TreeTagger failed on a file
	 */
	public boolean run(File binDir, File txmDir,  String modelfilename)
	{
		if(debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)
			println "TREETAGGER MODELS PATH : "+Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH)
		}

		// test if the Toolbox knows TreeTagger
		if (!new File(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/").exists())
		{
			println("Could not find TreeTagger binaries in "+Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/")
			return false;
		}

		// output directory of the tagged files
		File annotDir = new File(binDir, "annotations")
		annotDir.mkdir();

		// files to tag
		List<File> files = txmDir.listFiles()

		// get model file and check it
		File modelfile = new File(Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH), modelfilename);
		if (debug)
			println "model file : "+modelfile;

		if (!modelfile.exists()) {
			println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
			if(System.getProperty("os.name").startsWith("Windows"))
				println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
			return false;
		}

		// listFiles() returns null when txmDir cannot be listed: check it before calling size()
		if (files == null) {
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}

		//APPLY TREETAGGER
		println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+files.size()+" files)")
		if (files.size() == 0)
			return false;

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size());
		for (File f : files) {
			cpb.tick();

			File infile = f;
			File outfile = new File(annotDir, f.getName());
			boolean upToDate = outfile.exists() && // outfile exists
					outfile.lastModified() >= infile.lastModified() && // outfile is more recent
					outfile.length() > infile.length(); // outfile is bigger
			if (upToDate) continue;

			if (!applyTT(infile, outfile, modelfile)) {
				System.out.println("Failed to apply treetagger on file "+f);
				return false;
			}
		}
		println("")

		return true;
	}
}
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/Annotate.groovy (revision 927)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-10-03 15:30:36 +0200 (lun. 03 oct. 2016) $
25
// $LastChangedRevision: 3313 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xmltxm
29

  
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33
import java.util.concurrent.*
34

  
35
import org.txm.Toolbox
36
import org.txm.importer.*
37
import org.txm.importer.cwb.*
38
import org.txm.objects.*
39
import org.txm.stat.utils.ConsoleProgressBar;
40
import org.txm.treetagger.core.preferences.TreeTaggerPreferences
41
import org.txm.utils.LangDetector;
42
import org.txm.utils.logger.Log;
43
import org.txm.utils.treetagger.TreeTagger
44

  
45
/**
46
 * Annotate and replace the TEI-TXM files of the folder $rootDirFile/txm with TreeTagger.
47
 * creates $rootDirFile/interp and $rootDirFile/treetagger
48
 *
49
 */
50
class Annotate {
51
	boolean cancelNow = false;
52

  
53
	/** The report file. */
54
	File reportFile;//contains the txm:application tag content
55

  
56
	/** The resp person. */
57
	String respPerson;
58

  
59
	/** The resp id. */
60
	String respId;
61

  
62
	/** The resp desc. */
63
	String respDesc;
64

  
65
	/** The resp date. */
66
	String respDate;
67

  
68
	/** The resp when. */
69
	String respWhen;
70

  
71
	/** The app ident. */
72
	String appIdent;
73

  
74
	/** The app version. */
75
	String appVersion;
76

  
77
	/** The distributor. */
78
	String distributor;
79

  
80
	/** The publi stmt. */
81
	String publiStmt;
82

  
83
	/** The source stmt. */
84
	String sourceStmt;
85

  
86
	/** The types. */
87
	def types;
88

  
89
	/** The types title. */
90
	def typesTITLE;
91

  
92
	/** The types desc. */
93
	def typesDesc;
94

  
95
	/** The types tagset. */
96
	def typesTAGSET;
97

  
98
	/** The types web. */
99
	def typesWEB;
100

  
101
	/** The idform. */
102
	String idform;
103

  
104
	/** The debug. */
105
	boolean debug = false;
106

  
107
	File modelsDirectory;
108

  
109
	/**
	 * Creates an annotator whose models directory is read from the
	 * TreeTaggerPreferences.MODELS_PATH preference.
	 */
	public Annotate() {
		String modelsPath = TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE);
		modelsDirectory = new File(modelsPath);
	}
112

  
113
	/**
	 * Switches the debug traces on. There is no way to switch them off again
	 * through this method.
	 */
	public void setDebug()
	{
		this.debug = true;
	}
119

  
120
	String id;
121
	/**
	 * Initializes the stand-off file information using the word property names
	 * derived from the model file name.
	 *
	 * @param rootDirFile the directory of the "NLPToolsParameters.xml" report file
	 * @param modelfile the model file
	 * @param modelfilename the model file name
	 */
	public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename)
	{
		String[] defaultProperties = null; // null: property names are built from the model file name
		initTTOutfileInfos(rootDirFile, modelfile, modelfilename, defaultProperties);
	}
131

  
132
	/**
	 * Initializes the information written in the stand-off files: report file,
	 * responsibility, application, annotation types and their descriptions.
	 *
	 * @param rootDirFile the directory of the "NLPToolsParameters.xml" report file
	 * @param modelfile the model file, used in the annotation type descriptions
	 * @param modelfilename the model file name; the part before its first dot becomes the id ("??" is replaced by "xx")
	 * @param properties : 2 element array that contains the word properties to create. It can be null (the id built from modelfilename will be used)
	 */
	public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename, String[] properties)
	{
		// id = model file name up to its first dot
		id = modelfilename;
		String[] nameParts = id.split("\\.");
		if (nameParts.length > 0) {
			id = nameParts[0];
		}
		if ("??".equals(id)) {
			id = "xx"
		}

		reportFile = new File(rootDirFile,"NLPToolsParameters.xml");

		// responsibility statement
		respPerson = System.getProperty("user.name");
		respId = "txm";
		respDesc = "NLP annotation tool";
		respDate = DateFormat.getDateInstance(DateFormat.SHORT, Locale.UK).format(new Date());
		respWhen = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK).format(new Date());

		// application statement
		appIdent = "TreeTagger";
		appVersion = "3.2";

		distributor = "";
		publiStmt = """""";
		sourceStmt = """""";

		// names of the two word properties: given by the caller or built from the id
		boolean customNames = (properties != null && properties.length == 2);
		String firstName = customNames ? properties[0] : id+"pos";
		String secondName = customNames ? properties[1] : id+"lemma";
		types = [firstName, secondName];
		typesTITLE = [firstName, secondName];

		//TODO: the tagset, website and description should be referenced in the model catalog
		if ("rgaqcj.par".equals(modelfile.getName())) {
			typesDesc = ["CATTEX pos tagset built with BFM texts","fr lemma of the model "+modelfile+" - "]
			typesTAGSET = ["http://bfm.ens-lyon.fr/IMG/pdf/Cattex2009_Manuel.pdf",""]
			typesWEB = ["http://bfm.ens-lyon.fr/",""]
		} else {
			typesDesc = ["pos tagset built from model "+modelfile,id+" lemma of the model "+modelfile+" - "]
			typesTAGSET = ["",""]
			typesWEB = ["",""]
		}

		idform ="w";
	}
182

  
183
	/**
	 * Applies TreeTagger with the options stored in the preferences.
	 *
	 * @param ttsrcfile the TreeTagger input file
	 * @param ttoutfile the TreeTagger result file
	 * @param modelfile the model file
	 * @return true, if successful
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		String[] preferenceOptions = null; // null: the options are read from the preferences
		return applyTT(ttsrcfile, ttoutfile, modelfile, preferenceOptions);
	}
194

  
195
	/**
	 * Applies TreeTagger on one file, then deletes the input file.
	 *
	 * The binaries are taken from the "bin" directory of the
	 * TreeTaggerPreferences.INSTALL_PATH preference.
	 *
	 * @param ttsrcfile the TreeTagger input file; it is deleted once TreeTagger has run
	 * @param ttoutfile the TreeTagger result file
	 * @param modelfile the model file
	 * @param options the TreeTagger options; if null, the TreeTaggerPreferences.OPTIONS
	 *        preference is used, split on two consecutive spaces
	 * @return true if successful, false if an exception was raised (it is logged)
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile, String[] options)
	{
		try {
			File infile = ttsrcfile;
			File outfile = ttoutfile;

			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			String opt = TreeTaggerPreferences.getString(TreeTaggerPreferences.OPTIONS, TreeTaggerPreferences.PREFERENCES_NODE);
			if (options == null) {
				if (opt == null) opt = "";
				options = opt.split("  ");
			}

			def tt = new TreeTagger(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.debug(debug);
			tt.treetagger(modelfile.getAbsolutePath(), infile.getAbsolutePath(), outfile.getAbsolutePath())
			infile.delete();
		} catch(Exception e) {
			Log.printStackTrace(e);
			// the message must only use names visible here: the previous "$f" is not defined in
			// this method, so building the message failed instead of reporting the error
			System.out.println("Failed to apply TreeTagger on $ttsrcfile input file with the $modelfile model file.");
			return false;
		}
		return true;
	}
234

  
235
	/**
	 * Converts a TreeTagger result file into a stand-off annotation file with CSV2W_ANA,
	 * using the information set by initTTOutfileInfos. The encoding given to the
	 * converter is "UTF-8".
	 *
	 * @param ttoutfile the TreeTagger result file
	 * @param posfile the stand-off file to write
	 * @return true, if successful
	 */
	public boolean writeStandoffFile(File ttoutfile, File posfile)
	{
		def converter = new CSV2W_ANA();

		// describe the annotations, who and which tool produced them
		converter.setAnnotationTypes(types, typesDesc, typesTAGSET, typesWEB, idform);
		converter.setResp(respId, respDesc, respDate, respPerson, respWhen);
		converter.setApp(appIdent, appVersion);
		converter.setTarget(ttoutfile.getAbsolutePath(), reportFile);
		converter.setInfos(distributor, publiStmt, sourceStmt);

		return converter.process(ttoutfile, posfile, "UTF-8");
	}
254

  
255
	/**
	 * Runs the annotation step by step: builds the TreeTagger source files, runs TreeTagger,
	 * builds the XML stand-off files, then injects the stand-off annotations in the files of txmDir.
	 *
	 * The "annotations" and "treetagger" sub-directories of binDir are deleted and recreated.
	 *
	 * @param binDir the directory in which the "annotations" and "treetagger" working directories are created
	 * @param txmDir the directory of the files to annotate; each file is replaced by its annotated version
	 * @param modelfilename the model file name, resolved against the models directory; when it starts
	 *        with "??" the model name is built from the language guessed by LangDetector
	 * @return true if successful; false if a step failed, if a directory could not be listed
	 *         or contains no file, or if cancelNow was set
	 */
	public boolean run(File binDir, File txmDir,  String modelfilename)
	{
		if (debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE);
			println "TREETAGGER MODELS PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE)
		}

		//test if the TreeTagger binaries directory exists
		if (!new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/").exists()) {
			println("Could not find TreeTagger binaries in "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/")
			return false;
		}

		// language guessed from binDir, used as fallback for the files too small to be guessed
		String langAll = null
		String lang;
		if (modelfilename.startsWith("??")) {
			langAll = new LangDetector(binDir).getLang();
			println "General lang $langAll"
		}

		//cleaning
		new File(binDir, "annotations").deleteDir();
		new File(binDir, "annotations").mkdir();
		new File(binDir, "treetagger").deleteDir();
		new File(binDir, "treetagger").mkdir();

		//BUILD TT FILE READY TO BE TAGGED
		List<File> files = txmDir.listFiles()
		// listFiles() returns null when the directory cannot be listed: check it before calling size()
		if (files == null) {
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}

		println("Building TT source files ("+files.size()+") from directory "+txmDir)
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File f : files) {
			cpb.tick()
			File srcfile = f;
			File resultfile = new File(binDir, "treetagger/"+f.getName()+".tt");
			if(debug)
				println "build tt src : "+srcfile+" >> "+resultfile
			def ttsrcbuilder = new BuildTTSrc(srcfile.toURI().toURL())
			if (!ttsrcbuilder.process(resultfile, null))
				System.out.println("Failed to build tt src file of "+srcfile);
		}

		if (cancelNow) return false;

		// NOTE(review): this checks the directory of the MODELS_PATH preference while the model
		// files are read from the modelsDirectory field, which setModelsDirectory can change - confirm
		File modelDirectory = new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE));
		if (!modelDirectory.exists()) {
			println "Skipping ANNOTATE: TreeTagger language model file directory not found: "+modelDirectory.getAbsolutePath();
			return false;
		} else 	if (!modelDirectory.canRead()) {
			println "Skipping ANNOTATE: impossible to access the TreeTagger language model file directory: "+modelDirectory.getAbsolutePath();
			return false;
		}
		println("")

		//APPLY TREETAGGER
		files = new File(binDir, "treetagger").listFiles()
		if (files == null)
			return false;
		println("Applying $modelfilename TreeTagger model on dir: "+new File(binDir, "treetagger")+ " ("+files.size()+" files)")
		if (files.size() == 0)
			return false;
		File modelfile;
		cpb = new ConsoleProgressBar(files.size())
		for (File f : files) {
			String tmpModelFileName = modelfilename
			if (modelfilename.startsWith("??")) {
				// guess the language of the file when it is big enough, else use the general one
				lang = langAll;
				if (f.length() > LangDetector.MINIMALSIZE) {
					lang = new LangDetector(f).getLang();
				}
				tmpModelFileName = lang+".par"
			}
			modelfile = new File(modelsDirectory, tmpModelFileName);
			if (debug)
				println "model file : "+modelfile;

			if (!modelfile.exists()) {
				println "Skipping ANNOTATE: '$modelfile' TreeTagger language model file not found."
				if(System.getProperty("os.name").startsWith("Windows") || System.getProperty("os.name").startsWith("Mac"))
					println "Windows&Mac users: the operating system might be hiding file extensions. Use your file explorer to check the file name."
				return false;
			} else if (!modelfile.canRead()) {
				println "Skipping ANNOTATE: impossible to access the '$modelfile' TreeTagger language model file."
				return false;
			}

			cpb.tick()
			File infile = f;
			File outfile = new File(f.getParent(),f.getName()+"-out.tt");
			if (!applyTT(infile, outfile, modelfile)) {
				return false;
			}
		}
		println("")

		if (cancelNow) return false;

		//BUILD STAND-OFF FILES
		if (modelfile == null) {
			println "no model applied"
			return false;
		}

		initTTOutfileInfos(binDir, modelfile, modelfilename);
		files = new File(binDir, "treetagger").listFiles()// now contains the result files of TT
		if (files == null)
			return false;
		println "Building stdoff files ("+files.size()+") from dir:"+new File(binDir, "treetagger")+" to "+new File(binDir, "annotations");
		if (files.size() == 0)
			return false;
		cpb = new ConsoleProgressBar(files.size())
		for (File ttrezfile : files) {
			cpb.tick()
			File annotfile = new File(binDir, "annotations/"+ttrezfile.getName()+"-STOFF.xml");
			if (!writeStandoffFile(ttrezfile, annotfile))
				println("Failed to build standoff file of "+ttrezfile);
		}
		println("")

		if (cancelNow) return false;

		//INJECT ANNOTATIONS
		List<File> interpfiles = new File(binDir, "annotations").listFiles();
		List<File> txmfiles = txmDir.listFiles();
		if (txmfiles == null) {
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}
		if (interpfiles == null)
			return false;
		// both lists are sorted so that the i-th stand-off file matches the i-th file of txmDir
		// NOTE(review): this pairing assumes both directories contain the same number of files - confirm
		interpfiles.sort(); // same order
		txmfiles.sort(); //same order
		println "Injecting stdoff files ("+interpfiles.size()+") data from "+new File(binDir, "annotations")+ " to xml-txm files of "+txmDir;
		if (interpfiles.size() == 0)
			return false;
		cpb = new ConsoleProgressBar(interpfiles.size())
		for (int i = 0 ; i < interpfiles.size() ; i++) {
			cpb.tick()
			File srcfile = txmfiles.get(i);
			File pos1file = interpfiles.get(i);
			// the result is written in a temporary file next to the source, then replaces the source
			File temp = File.createTempFile("Annotate", "temp", srcfile.getParentFile());
			def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL());
			if (!builder.process(temp)) {
				return false;
			}

			if (!(srcfile.delete() && temp.renameTo(srcfile)))
				println "Warning can't rename file "+temp+" to "+srcfile
		}
		println("")
		return true;
	}
431

  
432
	/**
	 * Sets the directory in which the per-file run looks for the lang + ".par" model files.
	 *
	 * @param modelsDirectory the directory to remember in this instance
	 */
	public void setModelsDirectory(File modelsDirectory) {
		this.modelsDirectory = modelsDirectory
	}
435

  
436
	/**
	 * Runs the annotation file by file, which allows one language per file.
	 * Delegates to the six-argument overload with replace = false and empty
	 * properties and options arrays.
	 *
	 * @param binDir directory receiving the working sub-directories
	 * @param txmDir directory containing the files to annotate
	 * @param langs associates a file name with a model name
	 * @return the value returned by the delegated call
	 */
	public boolean run(File binDir, File txmDir, HashMap<String, String> langs)
	{
		boolean replaceExisting = false
		String[] noProperties = new String[0]
		String[] noOptions = new String[0]
		return run(binDir, txmDir, langs, replaceExisting, noProperties, noOptions)
	}
448

  
449
	/**
	 * Runs TreeTagger file by file, which allows one language per file.
	 * The files of txmDir are handed to a fixed thread pool sized to about 70%
	 * of the available processors (at least one thread); the call waits up to
	 * ten hours for the pool to finish.
	 *
	 * @param binDir directory whose "annotations", "ptreetagger" and "treetagger" sub-directories are deleted and recreated
	 * @param txmDir directory containing the files to annotate
	 * @param langs associates a file name with a model name; a file without entry is reported and left untouched
	 * @param replace forwarded to the per-file run (replace or create a word property)
	 * @param properties forwarded to the per-file run
	 * @param options forwarded to the per-file run
	 * @return false when txmDir cannot be listed, true otherwise (including when the TreeTagger path is wrong)
	 */
	public boolean run(File binDir, File txmDir, HashMap<String, String> langs, boolean replace, String[] properties, String[] options)
	{
		String treeTaggerBin = TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/"
		if (!new File(treeTaggerBin).exists()) {
			println("Path to TreeTagger is wrong "+treeTaggerBin)
			// NOTE(review): "nothing annotated" is reported as success here, presumably so that the import goes on - confirm
			return true;
		}

		//cleaning
		File annotDir = new File(binDir,"annotations");
		annotDir.deleteDir();
		annotDir.mkdir();
		File ptreetaggerDir = new File(binDir,"ptreetagger");
		ptreetaggerDir.deleteDir();
		ptreetaggerDir.mkdir();
		File treetaggerDir = new File(binDir,"treetagger");
		treetaggerDir.deleteDir();
		treetaggerDir.mkdir();

		// listFiles() returns null when txmDir is not a readable directory
		def files = txmDir.listFiles()
		if (files == null) {
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}

		int cores = Runtime.getRuntime().availableProcessors()
		int coresToUse = Math.max(1.0, cores * 0.7)
		ExecutorService pool = Executors.newFixedThreadPool(coresToUse)

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		// declared outside the loop so that each worker really gets its own number
		int counter = 1;
		for (File teiFile : files) {
			ThreadFile t = new ThreadFile("TT_"+counter++, teiFile) {

						public void run() {

							if (cancelNow) return;
							if (langs.get(f.getName()) == null) {
								println "Error: no lang defined for file $f"
								return;
							}

							String lang = langs.get(f.getName());
							run(f, lang, binDir, txmDir, replace, properties, options, annotDir, ptreetaggerDir, treetaggerDir)

							cpb.tick();
						}
					};

			pool.execute(t)
		}

		pool.shutdown()
		pool.awaitTermination(10, TimeUnit.HOURS)
		println ""
		return true;
	}
511
	/**
	 * Annotates a single file with the model named lang.
	 * Creates the "annotations", "ptreetagger" and "treetagger" sub-directories of
	 * binDir when needed, then delegates to the ten-argument overload with
	 * replace = false and empty properties and options arrays.
	 *
	 * @param f the file to annotate
	 * @param lang the model name
	 * @param binDir directory receiving the working sub-directories
	 * @param txmDir directory forwarded to the delegated call
	 * @return the value returned by the delegated call
	 */
	public boolean run(File f, String lang, File binDir, File txmDir) {
		File annotDir = new File(binDir, "annotations")
		File ptreetaggerDir = new File(binDir, "ptreetagger")
		File treetaggerDir = new File(binDir, "treetagger")
		for (File workDir : [annotDir, ptreetaggerDir, treetaggerDir]) {
			workDir.mkdir()
		}

		String[] noProperties = new String[0]
		String[] noOptions = new String[0]
		return run(f, lang, binDir, txmDir, false, noProperties, noOptions, annotDir, ptreetaggerDir, treetaggerDir)
	}
522

  
523

  
524
	/**
	 * Annotates one file: builds the TreeTagger source file, applies TreeTagger,
	 * writes the stand-off file and injects the annotations back into f.
	 * Every step is checked; when one fails the method stops before f is touched,
	 * so a failed injection can no longer replace f with a broken temporary file.
	 *
	 * @param f the file to annotate, replaced in place on success
	 * @param lang the model name; the model file is lang + ".par" in the models directory
	 * @param binDir forwarded to initTTOutfileInfos
	 * @param txmDir directory in which the temporary result file is written
	 * @param replace forwarded to AnnotationInjection
	 * @param properties forwarded to initTTOutfileInfos
	 * @param options forwarded to applyTT
	 * @param annotDir directory of the stand-off file
	 * @param ptreetaggerDir directory of the TreeTagger source file
	 * @param treetaggerDir directory of the TreeTagger result file
	 * @return false when the model is missing or a step failed, otherwise whether f exists after the replacement
	 */
	public boolean run(File f, String lang, File binDir, File txmDir, boolean replace, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) {

		File modelfile = new File(modelsDirectory, lang+".par");
		if (!modelfile.exists()) {
			println "Error: No Modelfile available for lang "+modelfile+". Continue import process ";
			return false;
		}
		File annotfile = new File(annotDir, f.getName()+"-STDOFF.xml");
		File ttsrcfile = new File(ptreetaggerDir, f.getName()+"-src.tt");
		File ttrezfile = new File(treetaggerDir, f.getName()+"-out.tt");

		//BUILD TT FILE READY TO BE TAGGED
		// toURI().toURL() escapes special characters, unlike the deprecated File.toURL()
		def builder = new BuildTTSrc(f.toURI().toURL());
		if (!builder.process(ttsrcfile, null)) {
			println "Error: failed to build the TreeTagger source file of "+f
			return false;
		}

		//Apply TT
		// NOTE(review): the four-argument applyTT is assumed to return a boolean like the three-argument one - confirm
		if (!applyTT(ttsrcfile, ttrezfile, modelfile, options)) {
			println "Error: failed to apply TreeTagger on "+ttsrcfile
			return false;
		}

		//CREATE STANDOFF FILES
		initTTOutfileInfos(binDir, modelfile, lang, properties);
		if (!writeStandoffFile(ttrezfile, annotfile)) {
			println("Failed to build standoff file of "+ttrezfile);
			return false;
		}

		//INJECT ANNOTATIONS
		File tmpFile = new File(txmDir, "temp_"+f.getName())
		builder = new AnnotationInjection(f.toURI().toURL(), annotfile.toURI().toURL(), replace);
		if (!builder.process(tmpFile)) {
			// keep the source file as it is and do not leave a partial result next to it
			tmpFile.delete()
			println "Error: failed to inject annotations of "+annotfile+" in "+f
			return false;
		}
		if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f

		return f.exists();
	}
555

  
556
	/**
	 * Raises the cancelNow flag, which the run methods test before going on.
	 */
	public void setCancelNow() {
		cancelNow = true
	}
559

  
560
	/**
	 * A named thread that carries the file it works on.
	 * It defines no run() of its own; the per-file run subclasses it anonymously.
	 */
	public class ThreadFile extends Thread {
		/** The file handled by this thread. */
		File f;

		/**
		 * @param name the thread name
		 * @param f the file handled by this thread
		 */
		public ThreadFile(String name, File f) {
			setName(name)
			this.f = f
		}
	}
567
}
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotationInjectionFromTTOutput.groovy (revision 927)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-02 14:17:06 +0100 (mer., 02 mars 2016) $
25
// $LastChangedRevision: 3134 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xmltxm
29
import javax.xml.stream.*
30

  
31
import org.txm.importer.StaxIdentityParser
32
import org.txm.importer.filters.*
33
import org.txm.importer.PersonalNamespaceContext
34

  
35
/**
36
 * The Class AnnotationInjection.
37
 *
38
 * @author mdecorde
39
 * 
40
 * inject annotation from a stand-off file into a xml-tei-txm file
41
 */
42

  
43
public class AnnotationInjectionFromTTOutput extends StaxIdentityParser {
44

  
45
	public static String TXMNS = "http://textometrie.org/1.0"
46

  
47
	/** The xml reader factory. */
48
	private def factory;
49

  
50
	/** The links. */
51
	private List<String> links;
52

  
53
	/** The linkparsers. key=type*/
54
	private HashMap<String, XMLStreamReader> linkparsers;
55

  
56
	/** The anaurl. */
57
	private def anaurl;
58

  
59
	/** The anainput data. */
60
	private def anainputData;
61

  
62
	/** The anafactory. */
63
	private XMLInputFactory anafactory = XMLInputFactory.newInstance();
64

  
65
	/** The anaparser. */
66
	private XMLStreamReader anaparser;
67
	private XMLStreamReader headerparser;
68

  
69
	/** The resp stmt id. */
70
	String respStmtID = "";
71

  
72
	/** The present taxonomies. */
73
	ArrayList<String> presentTaxonomies = new ArrayList();
74

  
75
	/**
	 * Instantiates a new annotation injection and prepares one parser per
	 * link group of the stand-off file.
	 * Parsing problems are reported on standard output and do not abort the construction.
	 *
	 * @param url the xml-tei-txm file
	 * @param anaurl the stand-off file
	 */
	public AnnotationInjectionFromTTOutput(URL url, URL anaurl) {
		super(url); // init reader and writer
		try {
			this.anaurl = anaurl;
			factory = XMLInputFactory.newInstance();
			this.buildLinkParsers();// build a parser per linkgroup
		} catch (XMLStreamException ex) {
			System.out.println(ex);
		} catch (IOException ex) {
			// say which file failed and why instead of a bare message
			System.out.println("IOException while parsing "+anaurl+": "+ex);
		}
	}
93

  
94
	/**
	 * Copies a part of the stand-off file into the output writer.
	 * The stand-off document is opened again and read from its beginning: start
	 * tags (with their namespace declarations and attributes), end tags, text and
	 * comments are replayed on the writer once copying has been switched on by the
	 * containertag element, and reading stops at the end tag of containertag.
	 *
	 * @param containertag local name of the element that delimits the copy
	 * @param captureTheTag true to copy the containertag element itself, false to copy only what it contains
	 */
	private void getHeaderInfos(String containertag, boolean captureTheTag)
	{
		def headerStream = new BufferedInputStream(anaurl.openStream());
		anainputData = headerStream;
		try {
			headerparser = anafactory.createXMLStreamReader(headerStream);
			boolean start = false;
			String localname;
			for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
			{
				if (event == XMLStreamConstants.START_ELEMENT) {
					localname = headerparser.getLocalName();
					if (captureTheTag && localname == containertag) // start copy with the tag
						start = true;
					if (start) { // copy header
						// getPrefix() may return null when the element has no prefix
						String prefix = headerparser.getPrefix();
						if (prefix != null && prefix.length() > 0)
							writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
						else
							writer.writeStartElement(localname);
						for (int i = 0 ; i < headerparser.getNamespaceCount(); i++)
							writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
						for (int i = 0 ; i < headerparser.getAttributeCount(); i++)
							writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
					}
					if (!captureTheTag && localname == containertag) // start copy after the tag
						start = true;
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					localname = headerparser.getLocalName();
					if (!captureTheTag && localname == containertag)
						break;// stop looping before the end tag is written

					if (start)
						writer.writeEndElement();

					if (captureTheTag && localname == containertag)
						break;// stop looping once the end tag is written
				} else if (event == XMLStreamConstants.CHARACTERS) {
					if (start)
						writer.writeCharacters(headerparser.getText());
				} else if (event == XMLStreamConstants.COMMENT) {
					if (start)
						writer.writeComment(headerparser.getText());
				}
			}
			headerparser.close();
		} finally {
			// XMLStreamReader.close() leaves the underlying stream open, so release it here
			headerStream.close();
		}
	}
140

  
141
	/**
	 * Reads the whole stand-off file once and collects the first attribute value of
	 * every linkGrp element, without duplicates and in document order.
	 * As a side effect respStmtID is set to the first attribute value of the
	 * last respStmt element met.
	 *
	 * @return the list of link group names
	 */
	private List<String> findGrpLink()
	{
		ArrayList<String> links = new ArrayList<String>();
		def scanStream = anaurl.openStream();
		anainputData = scanStream;
		try {
			anaparser = anafactory.createXMLStreamReader(scanStream);

			for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
				if (event != XMLStreamConstants.START_ELEMENT) continue;

				String localname = anaparser.getLocalName();
				if ("linkGrp".equals(localname)) {
					String targetsvalue = anaparser.getAttributeValue(0)

					if (links.contains(targetsvalue)) {
						System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
					} else {
						links.add(targetsvalue); // add the taxonomy type
					}
				} else if ("respStmt".equals(localname)) {
					respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
				}
			}
			anaparser.close();
		} finally {
			// XMLStreamReader.close() leaves the underlying stream open, so release it here
			scanStream.close();
		}
		return links;
	}
170

  
171
	/**
	 * Builds the link parsers.
	 * The groups declared in the stand-off file are listed first, then one reader
	 * per group is opened on its own stream and moved forward to the start tag of
	 * its linkGrp, so that the next elements read are the links of that group.
	 * The streams stay open because the readers are consumed later.
	 */
	private void buildLinkParsers()
	{
		links = findGrpLink();
		linkparsers = new HashMap<String, XMLStreamReader>();

		// first pass: one independent reader per group
		for (String group : links) {
			anainputData = new BufferedInputStream(anaurl.openStream());
			XMLStreamReader groupReader = anafactory.createXMLStreamReader(anainputData);
			linkparsers.put(group, groupReader);
		}

		// second pass: position every reader on its own group
		for (String group : links) {
			anaparser = linkparsers.get(group);
			int event = anaparser.next();
			while (event != XMLStreamConstants.END_DOCUMENT) {
				boolean onGroupStart = event == XMLStreamConstants.START_ELEMENT &&
						anaparser.getLocalName().equals("linkGrp") &&
						anaparser.getAttributeValue(0).equals(group);
				if (onGroupStart)
					break; // next element is a link start tag
				event = anaparser.next();
			}
		}
	}
201

  
202
	/**
	 * Gets the value carried by the next link element of a link group.
	 * The first attribute of the link must look like "#xxx #yyy"; the second part
	 * is returned with "&lt;" turned back into "<". Malformed links are reported on
	 * standard error and skipped.
	 *
	 * @param link the link group name, used to pick the reader
	 * @param wordId the word id (not used by the lookup)
	 * @return the next value, or an empty string when the reader reaches the end of the document
	 */
	private String getNextAnaValue(String link, String wordId)
	{
		anaparser = linkparsers.get(link);
		int event = anaparser.next();
		while (event != XMLStreamConstants.END_DOCUMENT) {
			if (event == XMLStreamConstants.START_ELEMENT && anaparser.getLocalName().matches("link")) {
				String target = anaparser.getAttributeValue(0)
				def matcher = target =~ /#(.*) #(.*)/
				if (matcher) { // external tag
					String anavalue = matcher[0][2];
					return anavalue.replace("&lt;", "<");
				}
				System.err.println("Error: getNextAna(): link target is not well formed:  = "+anaparser.getAttributeValue(0));
			}
			event = anaparser.next();
		}
		return "";
	}
231

  
232
	/**
	 * Writes the ana elements of a word, one per link group, each holding the
	 * next value read from that group.
	 *
	 * @param wordId the word id, forwarded to getNextAnaValue
	 */
	private void writeAnaTag(String wordId)
	{
		for (String group : links) {
			writer.writeStartElement(TXMNS, "ana");
			writer.writeAttribute("resp", "#"+respStmtID);
			writer.writeAttribute("type", "#"+group);
			// read the value only after the element has been opened
			String anaValue = getNextAnaValue(group, wordId);
			writer.writeCharacters(anaValue);
			writer.writeEndElement(); // txm:ana
		}
	}
249

  
250
	String wordId;
251
	boolean flagSourceDesc = false;
252
	/**
	 * Handles a start tag: records the first attribute of taxonomy elements in
	 * presentTaxonomies, remembers the value of the first attribute whose local
	 * name is "id" in wordId, then lets the parent class process the element.
	 */
	protected void processStartElement()
	{
		if (parser.getLocalName().matches("taxonomy")) {
			String taxonomyType = parser.getAttributeValue(0)
			presentTaxonomies.add(taxonomyType);
		}

		int index = 0;
		while (index < parser.getAttributeCount()) {
			if (parser.getAttributeLocalName(index) == "id") {
				wordId = parser.getAttributeValue(index);
				break
			}
			index++;
		}
		super.processStartElement();
	}
267

  
268
	boolean applicationWritten = false;
269
	boolean taxonomiesWritten = false;
270
	/**
	 * Handles an end tag before the parent class writes it:
	 * the ana elements are added at the end of a w element, and content read from
	 * the stand-off file is added at the end of appInfo, classDecl, encodingDesc
	 * (inside new appInfo / classDecl elements when none was seen before) and,
	 * when flagSourceDesc is set, titleStmt.
	 */
	protected void processEndElement() {
		String ended = parser.getLocalName();
		if (ended == "w") {
			writeAnaTag(wordId);
		} else if (ended == "appInfo") {
			applicationWritten = true;
			getHeaderInfos("appInfo", false);
		} else if (ended == "classDecl") {
			taxonomiesWritten = true;
			getHeaderInfos("classDecl", false);
		} else if (ended == "encodingDesc") {
			if (!applicationWritten) {
				writer.writeStartElement("appInfo");
				getHeaderInfos("appInfo", false);
				writer.writeEndElement(); // appInfo
			}
			if (!taxonomiesWritten) {
				writer.writeStartElement("classDecl");
				getHeaderInfos("classDecl", false);
				writer.writeEndElement(); // classDecl
			}
		} else if (ended == "titleStmt" && flagSourceDesc) {
			getHeaderInfos("respStmt", true);
			flagSourceDesc = false;
		}
		super.processEndElement();
	}
310

  
311
	/** The declarenamespace. */
312
	boolean declarenamespace = false;
313

  
314
	/**
	 * Declares the TEI default namespace and the txm namespace on the writer,
	 * the first time it is called only.
	 *
	 * @return true when the namespaces have just been declared, null otherwise
	 */
	private declareNamespace() {
		if (declarenamespace) return;

		writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
		writer.writeNamespace("txm", TXMNS);
		declarenamespace = true;
	}
326

  
327
	/**
	 * Manual test: injects two stand-off files, one after the other, into a sample file.
	 *
	 * @param args not used
	 */
	public static void main(String[] args) {

		// java.io.File does not expand "~", resolve the home directory explicitly
		String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";
		new File(rootDir + "/injection/").mkdir();

		// File has no three-String constructor: the sub-directory is joined to the root first
		File srcfile = new File(rootDir + "/anainline/", "roland.xml");
		File pos1file = new File(rootDir + "/pos/", "rolandTT1-w-ana.xml");

		File src2file = new File(rootDir + "/injection/", "roland.xml");
		File pos2file = new File(rootDir + "/pos/", "rolandTT2-w-ana.xml");

		println("process file : " + srcfile + " with : " + pos1file);
		// the only constructor of this class takes the two URLs, no milestone list
		def builder = new AnnotationInjectionFromTTOutput(srcfile.toURI().toURL(),
				pos1file.toURI().toURL());
		// NOTE(review): process(File) is assumed to be inherited from StaxIdentityParser - confirm
		builder.process(src2file);

		// the second pass uses pos2file, so report that one
		println("process file : " + src2file + " with : " + pos2file);
		builder = new AnnotationInjectionFromTTOutput(src2file.toURI().toURL(), pos2file.toURI().toURL());
		builder.process(new File(rootDir + "/injection/", "roland-FINAL.xml"));

		return;
	}
360
}
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/package.html (revision 927)
1
<html>
2
<body>
3
<p>Manages xml-tei-txm files</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 927)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.xmltxm
29

  
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import javax.xml.stream.*;
34
import java.net.URL;
35

  
36
import org.txm.Toolbox;
37
import org.txm.importer.filters.*;
38
import org.txm.treetagger.core.preferences.TreeTaggerPreferences
39

  
40
// TODO: Auto-generated Javadoc
41
/**
42
 * The Class BuildTTSrc.
43
 *
44
 * @author mdecorde
45
 * build the TT source for tigerSearch
46
 */
47

  
48
public class BuildTTSrc {
49
	
50
	/** The url. */
51
	private def url;
52
	
53
	/** The input data. */
54
	private def inputData;
55
	
56
	/** The factory. */
57
	private def factory;
58
	
59
	/** The parser. */
60
	private XMLStreamReader parser;
61
	
62
	/** The output. */
63
	private BufferedWriter output;
64
	
65
	/**
	 * Instantiates a new builds the tt src.
	 * uses XML-TXM V2
	 * Opens the url and creates the parser that process() reads; problems are
	 * reported on standard output and do not abort the construction.
	 *
	 * @param url the url of the file to process
	 */
	public BuildTTSrc(URL url) {
		try {
			this.url = url;
			inputData = url.openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);

		} catch (XMLStreamException ex) {
			System.out.println(ex);
		} catch (IOException ex) {
			// say which file failed and why instead of a bare message
			System.out.println("IOException while parsing "+url+": "+ex);
		}
	}
84
	
85
	/**
	 * Opens the output writer on the given file, encoded in UTF-8.
	 * Any failure is printed on standard output.
	 *
	 * @param outfile the file to write
	 * @return true, if the writer could be opened
	 */
	private boolean createOutput(File outfile) {
		try {
			def fileStream = new FileOutputStream(outfile)
			output = new BufferedWriter(new OutputStreamWriter(fileStream, "UTF-8"));
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}
102
	
103
	/**
	 * Writes the TreeTagger input of the document: one line per form element
	 * (its trimmed text with line breaks removed and "<" written as "&lt;", or
	 * "__EMPTY__" when there is no text) plus "<s>" / "</s>" lines for s elements.
	 * The writer, the parser and the input stream are released whatever happens.
	 *
	 * @param outfile the outfile
	 * @param formtype currently ignored: every form element is written
	 * @return true, if successful
	 */
	public boolean process(File outfile, String formtype) {
		if (!createOutput(outfile))
			return false;

		boolean flagform = false; // true while reading inside a form element
		StringBuilder form = new StringBuilder(); // the content of the current form tag
		StringBuilder buffer = new StringBuilder(); // the whole TreeTagger input
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						switch (parser.getLocalName()) {
							case "form":
								flagform = true;
								form.setLength(0);
								break;
							case "s": // TreeTagger can use s tags
								buffer.append("<s>\n");
								break;
						}
						break;
					case XMLStreamConstants.END_ELEMENT:
						switch (parser.getLocalName()) {
							case "form":
								flagform = false;
								String token = form.toString().trim()
								if (token.length() == 0) buffer.append("__EMPTY__\n");
								else buffer.append(token.replace("\n", "").replace("<", "&lt;")).append("\n");
								break;

							case "s":
								buffer.append("</s>\n");
								break;
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (flagform) {
							String text = parser.getText();
							if (text.length() > 0)
								form.append(text);
						}
						break;
				}
			}

			String str = buffer.toString()
			// NOTE(review): typographic apostrophes are replaced when FIX_APOSTROPHES is "false", which reads inverted - confirm
			if ("false".equals(TreeTaggerPreferences.getString(TreeTaggerPreferences.FIX_APOSTROPHES, TreeTaggerPreferences.PREFERENCES_NODE))) {
				str = str.replace("’", "'").replace("‘", "'");
			}
			output.write(str)
			// closed here so that a failed flush is reported as a failure
			output.close();
		} catch (Exception ex) {
			System.out.println(ex);
			return false;
		} finally {
			// release everything, also when parsing or writing failed
			for (def resource : [output, parser, inputData]) {
				try {
					if (resource != null) resource.close();
				} catch (Exception ignored) {
					// best effort: the result has already been decided above
				}
			}
		}

		return true;
	}
188
	
189
	/**
190
	 * The main method.
191
	 *
192
	 * @param args the arguments
193
	 */
194
	public static void main(String[] args) {
195
		
196
		String rootDir = "~/xml/rgaqcj/";
197
		// new File(rootDir+"/identity/").mkdir();
198
		
199
		ArrayList<String> milestones = new ArrayList<String>();// the tags who
200
		// you want them
201
		// to stay
202
		// milestones
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff