Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / fleurs / BuildEdition.groovy @ 1000

History | View | Annotate | Download (6.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.fleurs;
29

    
30
import org.txm.scripts.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.scripts.filters.*;
33
import java.io.FileOutputStream;
34
import java.io.IOException;
35
import java.io.OutputStreamWriter;
36
import java.util.ArrayList;
37
import javax.xml.parsers.SAXParserFactory;
38
import javax.xml.parsers.ParserConfigurationException;
39
import javax.xml.parsers.SAXParser;
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import org.xml.sax.InputSource;
43
import org.xml.sax.helpers.DefaultHandler;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
 * Builds a simple HTML edition of the Discours corpus from an XML-TEI file.
 *
 * The source is read with a StAX stream reader and the HTML is written as the
 * events arrive: a "corpus" teiHeader opens the HTML document, each "text"
 * element yields a small metadata list, each "p" a paragraph and each "w" a
 * span whose title carries the annotations collected from its "ana" children.
 * The whole conversion runs from the constructor.
 *
 * @author mdecorde
 */
class BuildEdition {
	/** Number of "w" start tags seen so far. */
	int wordcount;

	/** Word forms that must not be preceded by a space (e.g. closing punctuation). */
	List<String> NoSpaceBefore;

	/** Characters after which no space is inserted (compared with the last character of the previous word). */
	List<String> NoSpaceAfter;

	/** Value of the "id" attribute of the current "w" element. */
	String wordid;

	/** Text of the current "form" element; empty until a form has been read. */
	String wordvalue = "";

	/** Annotations ("type:value" pairs) collected for the current word; empty until an "ana" has been read. */
	String interpvalue = "";

	/** True while the parser is inside an "ana" element. */
	boolean flaginterp = false;

	/** Form of the previously written word; starts as a single space. */
	String lastword = " ";

	/** The wordtype (not used by this class). */
	String wordtype;

	/** True while the parser is inside a "form" element. */
	boolean flagform = false;

	/** URL of the input file. */
	private def url;

	/** Stream opened on the input URL. */
	private def inputData;

	/** StAX input factory used to create the parser. */
	private def factory;

	/** StAX reader over the input stream. */
	private XMLStreamReader parser;

	/** Writer of the HTML output (UTF-8). */
	def writer;

	/**
	 * Opens the input and output files and immediately builds the edition.
	 *
	 * @param infile the XML-TEI source file
	 * @param outfile the HTML file to write
	 * @param NoSpaceBefore word forms that are glued to the previous word
	 * @param NoSpaceAfter characters after which the next word is glued
	 * @throws IOException if the output file cannot be created
	 */
	BuildEdition(File infile, File outfile, List<String> NoSpaceBefore,
			List<String> NoSpaceAfter) {
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.url = infile.toURI().toURL();

		inputData = url.openStream();
		try {
			// NOTE(review): the factory is used with its default settings; if the
			// source files can be untrusted, DTD/external entity support should be reviewed.
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
			// Fail here with a clear message instead of letting process()
			// stumble on a null writer later.
			if (!createOutput(outfile)) {
				throw new IOException("Cannot create edition file: " + outfile);
			}
		} catch (Exception e) {
			// Nothing else will close the input stream if the set-up fails.
			inputData.close();
			throw e;
		}
		process();
	}

	/**
	 * Creates the UTF-8 output writer.
	 *
	 * @param outfile the outfile
	 * @return true if the writer was created, false (after printing the error
	 *         message) otherwise
	 */
	private boolean createOutput(File outfile) {
		try {
			writer = new OutputStreamWriter(new FileOutputStream(outfile),
					"UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Reads the whole XML stream and writes the HTML edition, then closes the
	 * writer, the parser and the input stream. The resources are released even
	 * when parsing fails, so that whatever was produced is flushed to disk.
	 */
	void process()
	{
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						startElement(parser.getLocalName());
						break;
					case XMLStreamConstants.END_ELEMENT:
						endElement(parser.getLocalName());
						break;
					case XMLStreamConstants.CHARACTERS:
						if (flagform || flaginterp) {
							appendText(parser.getText());
						}
						break;
				}
			}
			writer.write("</body>");
			writer.write("</html>");
		} finally {
			// Closing the writer flushes its buffer; without it the end of the
			// edition could be missing from the file.
			try {
				writer?.close();
			} finally {
				try {
					// XMLStreamReader.close() does not close the underlying
					// stream, hence the explicit inputData.close() below.
					parser.close();
				} finally {
					inputData.close();
				}
			}
		}
	}

	/** Handles a start tag: writes the opening markup or records the word state. */
	private void startElement(String localname) {
		switch (localname) {
			case "teiHeader":
				String headerType = parser.getAttributeValue(null, "type");
				if ("text".equals(headerType)) {
					writer.write("<h1 class=\"text\">Discours</h1>")
				} else if ("corpus".equals(headerType)) {
					writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
					writer.write("<html>");
					writer.write("<head>");
					writer.write("<title>Discours edition</title>");
					writer.write("</head>");
					writer.write("<body>");
				}
				break;

			case "text":
				writer.write("<ul>")
				writer.write("<li>type : " + escape(parser.getAttributeValue(null, "type")) + "</li>")
				writer.write("<li>locuteur : " + escape(parser.getAttributeValue(null, "loc")) + "</li>")
				writer.write("<li>date : " + escape(parser.getAttributeValue(null, "date")) + "</li>")
				writer.write("</ul>")
				break;

			case "p":
				writer.write("<p>");
				break;

			case "w":
				wordid = parser.getAttributeValue(null, "id");
				wordcount++;
				break;

			case "ana":
				flaginterp = true;
				interpvalue += " " + parser.getAttributeValue(null, "type") + ":"
				break;

			case "form":
				// A new form starts a new word: forget the previous values.
				wordvalue = "";
				interpvalue = "";
				flagform = true;
				break;
		}
	}

	/** Handles an end tag: closes paragraphs, resets the flags and writes finished words. */
	private void endElement(String localname) {
		switch (localname) {
			case "p":
				writer.write("</p>");
				break;

			case "form":
				flagform = false
				break;

			case "ana":
				flaginterp = false
				break;

			case "w":
				writeWord();
				break;
		}
	}

	/** Appends character data to the form and/or annotation being read. */
	private void appendText(String text) {
		if (text.length() == 0) {
			return;
		}
		if (flagform) {
			wordvalue += text;
		}
		if (flaginterp) {
			interpvalue += text;
		}
	}

	/**
	 * Writes the current word as a span, preceded by a space unless the word is
	 * listed in NoSpaceBefore, the previous word ends with a NoSpaceAfter
	 * character, or the word starts with a hyphen.
	 */
	private void writeWord() {
		// An empty previous word has no last character to test.
		boolean afterGluingChar = !lastword.isEmpty() &&
				NoSpaceAfter.contains(lastword.substring(lastword.length() - 1));
		boolean glued = NoSpaceBefore.contains(wordvalue) || afterGluingChar ||
				wordvalue.startsWith("-");

		if (!glued) {
			writer.write(" ");
		}
		writer.write("<span title=\"" + escape(interpvalue) + "\" id=\"" + escape(wordid) + "\">");
		writer.write(escape(wordvalue));
		writer.write("</span>");
		lastword = wordvalue;
	}

	/**
	 * Escapes the characters that are special in XML/HTML text and attribute
	 * values. A null value is rendered as "null", which is what string
	 * concatenation produced for a missing attribute before.
	 */
	private static String escape(String text) {
		if (text == null) {
			return "null";
		}
		// "&" must be replaced first so the entities added below stay intact.
		return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
	}

	/**
	 * Builds the edition of the Discours corpus stored in the user's TXM
	 * directory.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args)
	{
		// java.io.File does not expand "~": resolve the home directory explicitly.
		String rootDir = System.getProperty("user.home") + "/.txm/cwb/data/discours/src/";
		new File(rootDir+"/identity/").mkdir();

		File srcfile = new File(rootDir,"discours-p5.xml");
		File resultfile = new File(rootDir,"discours-p5.html");
		List<String> NoSpaceBefore = [",",".",")","]","}"];
		List<String> NoSpaceAfter = ["'","(","[","{"];
		println("build discours xml-tei file : "+srcfile+" to : "+resultfile );

		new BuildEdition(srcfile,resultfile, NoSpaceBefore, NoSpaceAfter);
	}
}