Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / fleurs / BuildTEI.groovy @ 479

History | View | Annotate | Download (6.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.fleurs;
29

    
30
import javax.xml.stream.*;
31
import java.io.FileOutputStream;
32
import java.io.OutputStreamWriter;
33
import java.net.URL;
34

    
35
// TODO: Auto-generated Javadoc
36
/**
 * Converts a "discours" XML file into a TEI-like corpus file.
 *
 * <p>The source document is read with a StAX parser. Only three elements are
 * interpreted: {@code corpus} (wrapped in a {@code teiCorpus}), {@code text}
 * (wrapped in a {@code TEI} element with its own header) and {@code w}
 * (one word, whose {@code form}, {@code pos}, {@code func} and {@code lem}
 * attributes are copied to the output). Character data is ignored.</p>
 *
 * <p>An instance wraps one input file and can run {@link #process(File)}
 * successfully only once, because the underlying reader is consumed and
 * closed by that call.</p>
 */
class BuildTEI {

    /** URL of the source XML file. */
    private def url;

    /** Input stream opened on {@link #url}. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader positioned on the source document. */
    private XMLStreamReader parser;

    /** Writer on the result file; stays null until {@link #createOutput(File)} succeeds. */
    private OutputStreamWriter output;

    /**
     * Opens the source file and creates the StAX reader on it.
     *
     * @param discoursxml the source XML file to convert
     */
    public BuildTEI(File discoursxml) {
        this.url = discoursxml.toURI().toURL();

        inputData = url.openStream();
        try {
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (Exception e) {
            // Do not leak the stream when the reader cannot be created.
            inputData.close();
            throw e;
        }
    }

    /**
     * Opens the result file for writing, encoded in UTF-8.
     *
     * @param outfile the file to write
     * @return true if the writer was opened; false (after printing the error
     *         message) otherwise
     */
    private boolean createOutput(File outfile) {
        try {
            output = new OutputStreamWriter(new FileOutputStream(outfile),
                    "UTF-8");
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Writes the XML declaration, the opening {@code teiCorpus} tag and an
     * empty corpus-level {@code teiHeader}.
     */
    public void writeHeader() {
        output.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        output
                .write("<teiCorpus xmlns=\"http://www.tei-c.org/ns/1.0\" xmlns:txm=\"http://textometrie.org/1.0\">\n");
        output.write("<teiHeader type=\"corpus\">\n");
        output.write("</teiHeader>\n");
    }

    /**
     * Writes the closing {@code teiCorpus} tag.
     */
    public void writeFooter() {
        output.write("</teiCorpus>\n");
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML text or in
     * a double-quoted attribute value.
     *
     * @param value the raw value, may be null
     * @return the escaped value, or an empty string when value is null
     */
    private static String escapeXml(String value) {
        if (value == null) {
            return "";
        }
        // '&' must be replaced first so the entities added below are not re-escaped.
        return value.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }

    /**
     * Reads an attribute of the element the parser is currently on, ready to
     * be written to the output.
     *
     * @param name the local name of the attribute
     * @return the XML-escaped value, or an empty string when the attribute is absent
     */
    private String attr(String name) {
        return escapeXml(parser.getAttributeValue(null, name));
    }

    /**
     * Writes the opening of one text: the TEI element, its header, the
     * {@code text} tag carrying the source attributes, and the first
     * paragraph and sentence.
     */
    private void writeTextStart() {
        // NOTE(review): an XML declaration in the middle of the corpus file is not
        // well-formed XML; kept as in the original in case a later step relies on
        // it to split the texts. Confirm before removing.
        output.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        output.write("<TEI xmlns=\"http://www.tei-c.org/ns/1.0\" xmlns:txm=\"http://textometrie.org/1.0\">\n");
        output.write("<teiHeader type=\"text\">\n");
        output.write("<fileDesc>\n");
        output.write("<titleStmt>\n");
        output.write("<title>" + attr("file") + "</title>\n");
        output.write("<respStmt>\n");
        output.write("<resp id=\"cordial\">initial tagging</resp>");
        output.write("</respStmt>\n");
        output.write("</titleStmt>\n");
        output.write("</fileDesc>\n");
        output.write("<encodingDesc>\n");
        output.write("<classDecl>\n");
        output.write("<taxonomy id=\"pos\"><bibl type=\"tagset\"/></taxonomy>\n");
        output.write("<taxonomy id=\"func\"><bibl type=\"tagset\"/></taxonomy>\n");
        output.write("<taxonomy id=\"lemma\"><bibl type=\"tagset\"/></taxonomy>\n");
        output.write("</classDecl>\n");
        output.write("</encodingDesc>\n");
        output.write("</teiHeader>\n");
        output.write("<text id=\"" + attr("id") + "\" type=\"" + attr("type")
                + "\" loc=\"" + attr("loc") + "\" date=\"" + attr("date")
                + "\" file=\"" + attr("file") + "\">\n");
        output.write("<p id=\"p_1\">\n");
        output.write("<s id=\"s_1\">\n");
    }

    /**
     * Writes the {@code w} element for the word the parser is currently on.
     *
     * @param idword the sequence number used to build the word id
     */
    private void writeWord(int idword) {
        output.write("<w id=\"w_" + idword + "\">\n");
        output.write(" <txm:form>" + attr("form") + "</txm:form>\n");
        // Each txm:ana is closed with its own name (the closing tag used to be </interp>).
        output.write(" <txm:ana resp=\"#cordial\" type=\"#pos\">" + attr("pos") + "</txm:ana>\n");
        output.write(" <txm:ana resp=\"#cordial\" type=\"#func\">" + attr("func") + "</txm:ana>\n");
        output.write(" <txm:ana resp=\"#cordial\" type=\"#lemma\">" + attr("lem") + "</txm:ana>\n");
        output.write("</w>\n");
    }

    /**
     * Closes the writer, the parser and the input stream. A failure to close
     * one of them is printed and does not prevent closing the others.
     */
    private void closeResources() {
        try {
            if (output != null) {
                output.close();
            }
        } catch (IOException e) {
            System.out.println(e.getLocalizedMessage());
        }
        try {
            if (parser != null) {
                parser.close();
            }
        } catch (XMLStreamException e) {
            System.out.println(e.getLocalizedMessage());
        }
        try {
            if (inputData != null) {
                inputData.close();
            }
        } catch (IOException e) {
            System.out.println(e.getLocalizedMessage());
        }
    }

    /**
     * Reads the whole source document and writes the converted corpus to
     * {@code outfile}.
     *
     * <p>If the output file cannot be opened, nothing is read and the method
     * returns after the error has been printed. Otherwise the writer, the
     * parser and the input stream are always closed when the method exits,
     * including when it exits with an exception. A {@code w} element whose
     * {@code para} or {@code sent} attribute is missing or not an integer
     * makes the method throw a NumberFormatException.</p>
     *
     * @param outfile the file to write
     */
    public void process(File outfile)
    {
        if (!createOutput(outfile)) {
            // The cause was already printed by createOutput; the input is left
            // untouched so the caller may retry with another output file.
            return;
        }

        int sentence = 0;
        int paragraph = 0;
        int idword = 0;

        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {

                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "corpus":
                                this.writeHeader();
                                break;
                            case "w":
                                idword++;

                                int para = Integer.parseInt(parser.getAttributeValue(null, "para"));
                                int sent = Integer.parseInt(parser.getAttributeValue(null, "sent"));
                                // NOTE(review): a paragraph change is only looked at when the
                                // sentence number changes too; confirm sentence numbers never
                                // repeat across consecutive paragraphs.
                                if (sent != sentence) {
                                    output.write("</s>\n");

                                    if (para != paragraph) {
                                        output.write("</p>\n");
                                        output.write("<p id=\"p_" + para + "\">\n");
                                        paragraph = para;
                                    }

                                    output.write("<s id=\"s_" + sent + "\">\n");
                                    sentence = sent;
                                }

                                writeWord(idword);
                                break;
                            case "text":
                                writeTextStart();
                                // writeTextStart opened p_1 and s_1.
                                paragraph = 1;
                                sentence = 1;
                                break;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "text":
                                output.write("</s>\n");
                                output.write("</p>\n");
                                paragraph = 1;
                                sentence = 1;
                                output.write("</text>\n");
                                output.write("</TEI>\n");
                                break;

                            case "corpus":
                                this.writeFooter();
                                break;
                        }
                        break;

                    // Every other event, character data included, is ignored.
                }
            }
        }
        catch (XMLStreamException ex) {
            System.out.println(ex);
        }
        catch (IOException ex) {
            System.out.println("IOException while parsing " + inputData + ": " + ex);
        }
        finally {
            closeResources();
        }
    }

    /**
     * Converts {@code discours.xml} into {@code discours-tei.xml} in one
     * directory.
     *
     * @param args optional; args[0] is the directory holding discours.xml.
     *             Without it, the directory is xml/discours/src under the
     *             user's home directory.
     */
    public static void main(String[] args) {

        // java.io.File does not expand "~", so the home directory is resolved explicitly.
        String rootDir = (args != null && args.length > 0)
                ? args[0]
                : new File(System.getProperty("user.home"), "xml/discours/src").getPath();

        File srcfile = new File(rootDir, "discours.xml");
        File resultfile = new File(rootDir, "discours-tei.xml");
        println("build discours xml-tei file : " + srcfile + " to : "
                + resultfile);

        def builder = new BuildTEI(srcfile);
        builder.process(resultfile);
    }
}