Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / TTrezToXml.groovy @ 479

History | View | Annotate | Download (5.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts;
29

    
30
// TODO: Auto-generated Javadoc
31
/**
 * Transforms a "TTrez" file into a TEI stand-off annotation file.
 *
 * The input is read line by line and split on tabulations: lines whose first
 * field starts with "&lt;" are ignored, every other non-blank line is counted as
 * one token. For each annotation type, a <code>linkGrp</code> is written that
 * links the token identifier (<code>w_&lt;idform&gt;_&lt;n&gt;</code>) to the value
 * found in the matching column.
 *
 * NOTE(review): the input is presumably TreeTagger output (see the values used
 * in {@link #main(String[])}) -- confirm against the producers of these files.
 *
 * @deprecated
 * @author mdecorde
 */
class TTrezToXml {
        /**
         * Charset used to encode the output file. It is also written in the XML
         * declaration, so that the declaration and the actual bytes always agree.
         */
        private static final String OUTPUT_ENCODING = "ISO-8859-1";

        /** The person. */
        String person = ""; // used for respStmt/name(person)

        /** The software. */
        String software = ""; // used for respStmt/name(software)

        /** The cmd line. */
        String cmdLine = ""; // used for respStmt/p

        /**
         * The title.
         * NOTE(review): not referenced by the methods of this class; the
         * title elements of the header are written from {@link #types}.
         */
        String title = "";

        /** The annotation types: one title element and one linkGrp is written per entry. */
        def types;

        /** The idform: infix of the token identifiers ("w_" + idform + "_" + n). */
        String idform = "";

        /**
         * The comments.
         * NOTE(review): not referenced by the methods of this class.
         */
        String comments = "...";

        /** The writer on the output file; opened and closed by {@link #process}. */
        Writer writer;

        /**
         * Converts one TTrez file into one stand-off XML file.
         *
         * The output is always encoded in ISO-8859-1, as announced by the XML
         * declaration, and the writer is closed even when the conversion fails.
         *
         * @param TTrez the input file (tab-separated fields, one token per line)
         * @param xmlform the XML file to create (overwritten if it exists)
         * @param encoding the charset name used to read <code>TTrez</code>
         */
        public void process(File TTrez, File xmlform, String encoding)
        {
                // FileWriter would use the platform default charset, which may
                // contradict the encoding declared in the XML header.
                writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(xmlform), OUTPUT_ENCODING));
                try {
                        writeHead();
                        writeBody(TTrez, encoding)
                        writeTail();
                } finally {
                        // do not leak the file handle if one of the steps throws
                        writer.close();
                }
        }

        /**
         * Writes the XML declaration, the teiHeader and the opening tags of the
         * text (text, body, div).
         */
        private void writeHead() {
                writer.write("<?xml version=\"1.0\" encoding=\"" + OUTPUT_ENCODING + "\"?>\n");
                writer.write("<!DOCTYPE TEI SYSTEM \"tei_bfm_v1.dtd\">\n");
                writer.write("<TEI xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\">\n");
                writer.write("<teiHeader xml:lang=\"eng\">\n");
                writer.write("<fileDesc>\n");
                writer.write("<titleStmt>\n");
                // one title per annotation type (the "title" property is not used here)
                for (int i = 0; i < types.size(); i++) {
                        writer.write("<title>" + escapeXml(types[i]) + "</title>\n");
                }
                writer.write("<respStmt>\n");
                writer.write("<resp>produced by</resp>\n");
                writer.write("<name type=\"person\">" + escapeXml(person) + "</name>\n");
                writer.write("<name type=\"software\">" + escapeXml(software) + "</name>\n");
                writer.write("<p>" + escapeXml(cmdLine) + "</p>\n");
                writer.write("</respStmt>\n");
                writer.write("</titleStmt>\n");
                writer.write("<publicationStmt>\n");
                writer.write("<distributor>BFM project - http://bfm.ens-lsh.fr</distributor>\n");
                writer.write("<availability>\n");
                writer.write("<p>(c) 2010 Projet BFM - CNRS/ENS-LSH.\n");
                writer.write("<hi>Conditions d'utilisation</hi> : \n");
                writer.write("Sous licence <ref target=\"http://creativecommons.org/licenses/by-sa/2.0/fr/\">Creative Commons</ref>.\n");
                writer.write("</p>\n");
                writer.write("</availability>\n");
                writer.write("</publicationStmt>\n");
                writer.write("<sourceDesc>\n");
                writer.write("<p>born digital : TXM project - http://textometrie.org</p>\n");
                writer.write("</sourceDesc>\n");
                writer.write("</fileDesc>\n");
                writer.write("</teiHeader>\n");
                writer.write("<text xml:lang=\"fr\" type=\"standoff\">\n");
                writer.write("<body>\n");
                writer.write("<div>\n");
        }

        /**
         * Writes one linkGrp per annotation type.
         *
         * The annotation of index <code>i</code> in {@link #types} is read from
         * the column <code>i + 1</code> of the input (column 0 is the first field
         * of the line). Token numbering restarts at 1 for each linkGrp.
         *
         * @param TTrez the input file
         * @param encoding the charset name used to read <code>TTrez</code>
         * @return no meaningful value; the method is used for its side effects
         */
        private writeBody(File TTrez, String encoding)
        {
                String targets = "w_" + idform + "_";
                String content = TTrez.getText(encoding)
                String separator = "\t"

                for (int i = 0; i < types.size(); i++) {
                        int column = i + 1;
                        int id = 1;
                        writer.write("<linkGrp type=\"" + escapeXml(types[i]) + "\">\n");
                        content.splitEachLine(separator) { fields ->
                                // a line made only of separators yields no field at all
                                String first = fields.isEmpty() ? "" : fields[0];
                                if (first == null || first.isEmpty()) {
                                        return; // blank line: not a token
                                }
                                if (first.startsWith("<")) {
                                        return; // markup line: not a token
                                }
                                if (column < fields.size()) {
                                        writer.write("<link targets=\"#" + escapeXml(targets + id) + " #" + escapeXml(fields[column]) + "\"/>\n");
                                }
                                // NOTE(review): a token without the requested column gets no
                                // link (instead of a "#null" target) but is still counted, so
                                // that the following identifiers stay aligned -- confirm that
                                // this is the expected handling of truncated lines.
                                id++;
                        }
                        writer.write("</linkGrp>\n");
                }
        }

        /**
         * Writes the closing tags matching the ones opened by the header.
         */
        private void writeTail() {
                writer.write("</div>\n");
                writer.write(" </body>\n");
                writer.write("</text>\n");
                writer.write("</TEI>\n");
        }

        /**
         * Escapes the characters that are not allowed as-is in XML text or in a
         * double-quoted attribute value.
         *
         * @param value the value to write; <code>null</code> is written as an empty string
         * @return the escaped text
         */
        private static String escapeXml(Object value) {
                if (value == null) {
                        return "";
                }
                // "&" must be replaced first, otherwise the other entities would be escaped twice
                return value.toString()
                                .replace("&", "&amp;")
                                .replace("<", "&lt;")
                                .replace(">", "&gt;")
                                .replace("\"", "&quot;");
        }

        /**
         * Sets the annotation types.
         *
         * @param types the annotation types, in the order of the input columns
         * @param idform the infix of the token identifiers
         */
        public void setAnnotationTypes(def types, String idform) {
                this.types = types;
                this.idform = idform;
        }

        /**
         * Sets the content of the respStmt element of the header.
         *
         * @param person the person
         * @param software the software
         * @param cmdLine the cmd line
         */
        public void setResp(String person, String software, String cmdLine) {
                this.person = person;
                this.software = software;
                this.cmdLine = cmdLine;
        }

        /**
         * Converts a fixed list of files found in the "xml/rgaqcj/tt" directory
         * of the user's home into the sibling "pos" directory.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // "~" is a shell shortcut that the JVM does not expand:
                // resolve the home directory explicitly.
                File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
                File ttDir = new File(rootDir, "tt");
                File posDir = new File(rootDir, "pos");
                posDir.mkdirs();

                def ttfiles = ["roland-tt.txt", "artu-tt.txt", "qjm-tt.txt", "commyn1-tt.txt", "jehpar-tt.txt"];
                def posfiles = ["roland-pos.xml", "artu-pos.xml", "qjm-pos.xml", "commyn1-pos.xml", "jehpar-pos.xml"];

                String encoding = "ISO-8859-1";
                String person = "Serge Heiden";
                String software = "tree-tagger cmd line with options ...";
                String cmdLine = "tree-tagger -token ...";
                def types = ["POS", "lemma"];
                String idform = "fro";

                for (int i = 0; i < ttfiles.size(); i++) {
                        File ttfile = new File(ttDir, ttfiles[i]);
                        File posfile = new File(posDir, posfiles[i]);
                        println("Process file : " + ttfile + " to : " + posfile);

                        def transfo = new TTrezToXml();
                        transfo.setAnnotationTypes(types, idform);
                        transfo.setResp(person, software, cmdLine);
                        transfo.process(ttfile, posfile, encoding);
                }
        }
}