Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / test / CQPBenchmark.groovy @ 544

History | View | Annotate | Download (13.7 kB)

1
package org.txm.test
2

    
3

    
4
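/**
 * Benchmark script: runs a fixed sequence of TXM commands on a corpus,
 * several times, and prints the time taken by each command for each run.
 */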
9
// Copyright © 2010-2013 ENS de Lyon.
10
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
11
// Lyon 2, University of Franche-Comté, University of Nice
12
// Sophia Antipolis, University of Paris 3.
13
// 
14
// The TXM platform is free software: you can redistribute it
15
// and/or modify it under the terms of the GNU General Public
16
// License as published by the Free Software Foundation,
17
// either version 2 of the License, or (at your option) any
18
// later version.
19
// 
20
// The TXM platform is distributed in the hope that it will be
21
// useful, but WITHOUT ANY WARRANTY; without even the implied
22
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
23
// PURPOSE. See the GNU General Public License for more
24
// details.
25
// 
26
// You should have received a copy of the GNU General
27
// Public License along with the TXM platform. If not, see
28
// http://www.gnu.org/licenses.
29
//
30
//
31
//
32
// $LastChangedDate: 2011-11-25 11:30:11 +0100 (ven., 25 nov. 2011) $
33
// $LastChangedRevision: 2069 $
34
// $LastChangedBy: mdecorde $
35
//
36

    
37

    
38
import java.util.ArrayList;
39

    
40
import org.txm.utils.DeleteDir;
41
import org.txm.*;
42
import org.txm.lexicaltable.core.functions.LexicalTable
43
import org.txm.objects.Base;
44
import org.txm.ca.core.functions.CA
45
import org.txm.cah.core.functions.AHC
46
import org.txm.concordance.core.functions.Concordance
47
import org.txm.cooccurrence.core.functions.Cooccurrence
48
import org.txm.functions.*;
49
import org.txm.functions.diagnostic.*;
50
import org.txm.functions.ca.*;
51
import org.txm.functions.index.*;
52
import org.txm.functions.referencer.Referencer;
53
import org.txm.functions.specificities.*;
54
import org.txm.functions.concordances.*;
55
import org.txm.functions.concordances.comparators.*;
56
import org.txm.searchengine.cqp.ReferencePattern
57
import org.txm.searchengine.cqp.corpus.*;
58
import org.txm.searchengine.cqp.corpus.query.*;
59

    
60

    
61
// NOTE: testDir, csv and firstExecution are deliberately assigned without 'def':
// that puts them in the script binding, which is how process() below reads them.

// Root directory receiving the exports of every run; wiped at startup.
testDir = new File(System.getProperty("user.home"), "TXM/testrelease");
DeleteDir.deleteDirectory testDir;
// mkdirs() rather than mkdir(): also creates the parent directory when it is missing
testDir.mkdirs()

// Report table: one header row, then one row per benchmarked command.
csv = [];
// Number of process() calls completed so far; 0 means the report rows do not exist yet.
firstExecution = 0;
68

    
69
/**
 * Runs the benchmark sequence on one corpus and records how long each step takes.
 *
 * On the first call (firstExecution == 0) a header row and one row per step are
 * appended to the script-level csv list; each step row holds the step description
 * followed by its duration in seconds. On later calls only the run label and the
 * durations are appended to the rows created by the first call, so every call adds
 * one column to the report. firstExecution is incremented at the end of each call.
 *
 * Exports are written to a sub-directory of testDir named after the corpus; that
 * directory is deleted and recreated on every call.
 *
 * @param CORPUSNAME name of the corpus requested from the CorpusManager
 * @param QUERY1 query string used by the first series of query based commands
 * @param QUERY2 query string used by the second series of query based commands
 */
def process(String CORPUSNAME, String QUERY1, String QUERY2) {

	String ENCODING = "UTF-8"
	String COLSEPARATOR = "\t"
	String TXTSEPARATOR = ""
	// index of the csv row receiving the next value when the rows already exist
	int i = 0;
	// NOTE(review): the result is unused; the call is kept in case it has loading side effects - TODO confirm
	def corpora = CorpusManager.getCorpusManager().getCorpora()
	MainCorpus corpus = CorpusManager.getCorpusManager().getCorpus(CORPUSNAME)
	//println "CORPUS: "+corpus

	Query query = new Query(QUERY1) //"\"..............*\"", "\"..........*\""
	Query query2 = new Query(QUERY2)

	File exporttestdir = new File(testDir, corpus.getName());
	DeleteDir.deleteDirectory exporttestdir;
	exporttestdir.mkdir()
	//println "Results are saved in dir: "+ exporttestdir

	// word properties
	def word_property = corpus.getProperty("word")

	// structure properties
	StructuralUnit text_su = corpus.getStructuralUnit("text")
	StructuralUnit s_su = corpus.getStructuralUnit("s")
	Property text_id_property = text_su.getProperty("id")
	ReferencePattern referencePattern = new ReferencePattern().addProperty(text_id_property)

	// Records the time elapsed since 'start' (in seconds): as a new row built from
	// 'description' on the first run, as an extra cell of the existing row afterwards.
	// 'description' is a closure so that the row details are only computed when a row
	// is actually created, and never counted in the measured time.
	def record = { long start, Closure description ->
		def seconds = (System.currentTimeMillis()-start)/1000
		if (firstExecution == 0)
			csv << (description() + [seconds])
		else
			csv[i++] << seconds
	}

	long time;
	// START START START START
	def runLabel = "$CORPUSNAME $QUERY1 $QUERY2 mode "+Toolbox.getParam(Toolbox.CQI_NETWORK_MODE)
	if (firstExecution == 0)
		csv << ["object", "size", "nPart", "command", "query", "query freq", runLabel]
	else
		csv[i++] << runLabel

	// INFORMATIONS
	print " INFO"
	time = System.currentTimeMillis();
	Diagnostic diag = new Diagnostic(corpus, 20)
	diag.stepGeneralInfos();
	diag.stepLexicalProperties();
	diag.stepStructuralUnits();
	diag.toHTML(new File(exporttestdir, "diag"))
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Informations", "no query", "no freq"] }

	// LEXICON
	print " LEX"
	time = System.currentTimeMillis();
	corpus.getLexicon(word_property).toTxt(new File(exporttestdir, "lexpos"), ENCODING, COLSEPARATOR, TXTSEPARATOR);
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Lexicon", "no query", "no freq"] }

	// ---- first series of query based commands (QUERY1) ----

	// INDEX
	print " INDEX"
	time = System.currentTimeMillis();
	IndexSample index = new IndexSample(corpus, query, [word_property])
	index.toTxt(new File(exporttestdir, "indexlemmafuncj"), ENCODING, COLSEPARATOR, TXTSEPARATOR)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Index", query, index.getT()] }

	// REFERENCER
	print " REF"
	time = System.currentTimeMillis();
	Referencer referencer = new Referencer(corpus, query, word_property, [text_id_property], true);
	referencer.getQueryMatches()
	referencer.getQueryindexes()
	referencer.groupPositionsbyId()
	referencer.toTxt(new File(exporttestdir, "referencer"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Referencer", query, index.getT()] }

	// CONCORDANCE
	print " CONC"
	time = System.currentTimeMillis();
	Concordance concordance = new Concordance(corpus, query, word_property, [word_property, word_property], referencePattern, referencePattern, 15, 15)
	concordance.toTxt(new File(exporttestdir,"concj"), Concordance.Format.CONCORDANCE)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Concordances", query, index.getT()] }

	// COOCCURRENCE WORD WINDOW
	print " COOC"
	time = System.currentTimeMillis();
	Cooccurrence cooc = new Cooccurrence(corpus, query, [word_property], null, 21, 1, 1, 11, 2, 3, 1, false);
	cooc.process();
	cooc.toTxt(new File(exporttestdir, "cooc_wordwindow"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Cooccurrences words", query, index.getT()] }

	// COOCCURRENCE SENTENCE WINDOW
	print " COOC"
	time = System.currentTimeMillis();
	Cooccurrence cooc2 = new Cooccurrence(corpus, query, [word_property], s_su,2, 1, 1, 1, 2, 3,1, false);
	cooc2.process();
	cooc2.toTxt(new File(exporttestdir, "cooc_swindow"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Cooccurrences structures", query, index.getT()] }

	// ---- second series of query based commands (QUERY2) ----
	// The export files reuse the names of the first series and overwrite them.
	// 'index' is rebuilt for query2 first, so index.getT() below is the query2 frequency;
	// the rows are labelled with query2 accordingly.

	// INDEX
	print " INDEX"
	time = System.currentTimeMillis();
	index = new IndexSample(corpus, query2, [word_property])
	index.toTxt(new File(exporttestdir, "indexlemmafuncj"), ENCODING, COLSEPARATOR, TXTSEPARATOR)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Index", query2, index.getT()] }

	// REFERENCER
	print " REF"
	time = System.currentTimeMillis();
	referencer = new Referencer(corpus, query2, word_property, [text_id_property], true);
	referencer.getQueryMatches()
	referencer.getQueryindexes()
	referencer.groupPositionsbyId()
	referencer.toTxt(new File(exporttestdir, "referencer"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Referencer", query2, index.getT()] }

	// CONCORDANCE
	print " CONC"
	time = System.currentTimeMillis();
	concordance = new Concordance(corpus, query2, word_property, [word_property, word_property], referencePattern, referencePattern, 15, 15)
	concordance.toTxt(new File(exporttestdir,"concj"), Concordance.Format.CONCORDANCE)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Concordances", query2, index.getT()] }

	// COOCCURRENCE WORD WINDOW
	print " COOC"
	time = System.currentTimeMillis();
	cooc = new Cooccurrence(corpus, query2, [word_property], null, 21, 1, 1, 11, 2, 3, 1, false);
	cooc.process();
	cooc.toTxt(new File(exporttestdir, "cooc_wordwindow"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Cooccurrences words", query2, index.getT()] }

	// COOCCURRENCE SENTENCE WINDOW
	print " COOC"
	time = System.currentTimeMillis();
	cooc2 = new Cooccurrence(corpus, query2, [word_property], s_su,2, 1, 1, 1, 2, 3,1, false);
	cooc2.process();
	cooc2.toTxt(new File(exporttestdir, "cooc_swindow"), ENCODING)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Cooccurrences structures", query2, index.getT()] }

	// SUBCORPORA
	print " SUBCORPUS"
	time = System.currentTimeMillis();
	// the subcorpus itself is not used afterwards (SPECIF is disabled); only its creation is timed
	Corpus DGcorpus = corpus.createSubcorpus(text_su, text_id_property, "01_DeGaulle", "dgsubcorpus")
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Subcorpus", "no query", "no freqs"] }

	// PARTITIONS
	print " PARTITIONS"
	time = System.currentTimeMillis();
	// both partitions are built on the same structure property (text id)
	Partition discours_types = corpus.createPartition(text_su, text_id_property)
	Partition discours_dates = corpus.createPartition(text_su, text_id_property)
	record(time) { [corpus.getName(), corpus.getSize(), 1, "Partition 2x", "no query", "no freqs"] }

	// LEXICAL TABLE
	print " LT"
	time = System.currentTimeMillis();
	LexicalTable table = discours_types.getLexicalTable(word_property, 2);
	table.exportData(new File(exporttestdir, "type_LT"), COLSEPARATOR, TXTSEPARATOR);
	record(time) { [discours_types.getName(), corpus.getSize(), discours_types.getNPart(), "LT part", "no query", "no freqs"] }

	// SPECIF SUBCORPUS (disabled)
//	time = System.currentTimeMillis();
//	print " SPECIF"
//	SpecificitesResult specifresult2 = org.txm.functions.specificities.Specificites.specificites(DGcorpus.getParent(), DGcorpus, word_property)
//	specifresult2.toTxt(new File(exporttestdir,"dgsub_specifloc"), ENCODING, COLSEPARATOR, TXTSEPARATOR)
//	record(time) { [DGcorpus.getName(), corpus.getSize(), 1, "Specif sub", "no query", "no freqs"] }
//
//	// SPECIF LEXICAL TABLE (disabled)
//	print " SPECIF"
//	time = System.currentTimeMillis();
//	SpecificitesResult specifresult3 = org.txm.functions.specificities.Specificites.specificites(table);
//	specifresult3.toTxt(new File(exporttestdir,"speciftype"), ENCODING, COLSEPARATOR, TXTSEPARATOR)
//	record(time) { [table.getName(), corpus.getSize(), table.getNColumns(), "specif LT", "no query", "no freqs"] }

	// AFC PARTITION
	print " AFC"
	time = System.currentTimeMillis();
	CA ca = new CA(discours_dates, word_property, 0 ,9999999)
	ca.stepLexicalTable();
	ca.stepSortTableLexical();
	ca.compute()
	ca.toSVGFactorialMap(new File(exporttestdir,"cadates"), true, true)
	ca.toSVGSingularValues(new File(exporttestdir,"cadates_singularvalues"))
	record(time) { [discours_dates.getName(), corpus.getSize(), discours_dates.getNPart(), "AFC part", "no query", "no freqs"] }

	// AFC LEXICAL TABLE
	print " AFC"
	time = System.currentTimeMillis();
	CA ca2 = new CA(table);
	ca2.compute()
	// NOTE(review): same file names as the partition AFC above, so those exports are overwritten
	ca2.toSVGFactorialMap(new File(exporttestdir,"cadates"), true, true)
	ca2.toSVGSingularValues(new File(exporttestdir,"cadates_singularvalues"))
	record(time) { [table.getName(), corpus.getSize(), table.getNColumns(), "AFC LT", "no query", "no freqs"] }

	// CAH
	print " CAH"
	time = System.currentTimeMillis();
	AHC cah = new AHC(ca, true, AHC.getDefaultMethod(), AHC.getDefaultMetric(), 4, false)
	cah.stepCompute()
	cah.toSVG(new File(exporttestdir, "dates_cah"),RDevice.SVG);
	record(time) { [discours_dates.getName(), corpus.getSize(), discours_dates.getNPart(), "CAH ca table", "no query", "no freqs"] }

	firstExecution++;
	println ""

}
326

    
327

    
328

    
329
// Benchmark driver: for each CQI_NETWORK_MODE value ("false" then "true"),
// set the parameter, then restart the Toolbox and run the sequence twice.
// The first process() call of the script is the one that creates the report rows.
for (def mode in [[flag: "false", banner: "MEMORY MODE: "], [flag: "true", banner: "NETWORK MODE: "]]) {
	println "restarting TBX..."
	Toolbox.setParam(Toolbox.CQI_NETWORK_MODE, mode.flag)
	2.times {
		// the value returned by restart() is printed next to the mode banner
		println mode.banner+Toolbox.restart();
		process("DISCOURS", "\"..............*\"", "\"..........*\"")
	}
}
342

    
343

    
344

    
345

    
346
// add TOTAL line
347
// Row layout: cells 0-5 describe the command, cells 6 and up hold one timing per run.
int firstTimingColumn = 6
// A plain list (not int[]): it mixes text cells with numeric totals and must support <<.
def totaux = ["TOTAL", "", "", "", "", ""]
if (csv.size() > 1) {
	// the header row has one cell per column of the report
	int columnCount = csv[0].size()
	for (int col = firstTimingColumn ; col < columnCount ; col++) {
		def sum = 0
		// skip the header row: its timing cells are run labels, not numbers
		for (def row : csv.tail()) {
			def cell = row[col]
			if (cell instanceof Number) sum += cell
		}
		totaux << sum
	}
}
csv << totaux;
356

    
357
// WRITE ALL RESULTS
358
// Print every row of the report as one tab separated line.
// All cells are written: the description cells as well as one timing cell per run
// (the row length, not the number of runs, decides how many cells are printed).
for (def line : csv) {
	println line.join("\t")
}