Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / BuildAfrLexicon.groovy @ 479

History | View | Annotate | Download (4.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts;
29

    
30
import groovy.lang.GroovyClassLoader
31

    
32
import java.io.File
33

    
34
import org.txm.utils.ProcessBuilderBuilder
35
// TODO: Auto-generated Javadoc
36

    
37
/**
 * Builds an afr lexicon in SQL tables and exports it to a text file.
 *
 * The steps performed by {@link #main(String[])} are:
 * <ol>
 *   <li>decode the RGAQCJ corpus with the CwbDecode wrapper into lexRGAQCJ.txt,</li>
 *   <li>import lexRGAQCJ.txt, afrlex.txt and TTnca_2_CTX9.txt into tables,</li>
 *   <li>insert rows of lexRGAQCJ into afrlex,</li>
 *   <li>replace the cat column of afrlex using the TTnca_2_CTX9 table,</li>
 *   <li>export afrlex to lexfra-rgaqcj.txt.</li>
 * </ol>
 *
 * @author mdecorde
 */
class BuildAfrLexicon
{

    /**
     * Replaces a leading "~" by the value of the "user.home" system property.
     * java.io.File does not perform shell-style tilde expansion, so a path
     * such as "~/xml" would otherwise be resolved relative to the current
     * working directory.
     *
     * @param path the path to expand
     * @return the path with a leading "~" expanded, or the path unchanged
     */
    private static String expandHome(String path)
    {
        if (path == "~" || path.startsWith("~/"))
            return System.getProperty("user.home") + path.substring(1);
        return path;
    }

    /**
     * The main method.
     *
     * @param args the arguments (not used)
     */
    static void main(String[] args)
    {
        HSQLFunctions.clearAll();

        // get lexicon from TXM RGAQCJ
        // NOTE(review): the second File given to build() is under
        // org/textometrie/scripts whereas CwbDecode.groovy is parsed below
        // from org/txm/scripts -- confirm that both paths are intended.
        ProcessBuilderBuilder.build(new File("src/groovy/org/txm/scripts/cwb-decode-wrapper-definition.xml"), new File("src/groovy/org/textometrie/scripts/CwbDecode.groovy"));

        GroovyClassLoader gcl = new GroovyClassLoader();
        gcl.addClasspath(".");

        String registryPath = expandHome("~/Bureau/trunkCWB/corpora/registry");
        //String registryPath = "C:/Documents and Settings/sheiden/.txm/registry"
        String cwbdecodeexecDir = expandHome("~/Bureau/trunkCWB/cwb-3.0/utils/");
        //String cwbdecodeexecDir = "C:/Projets/Textométrie/Logiciel/Toolbox/0.4.5/toolbox/src/main/C/cwb-3.0/utils/"

        Class clazz = gcl.parseClass(new File("src/groovy/org/txm/scripts/CwbDecode.groovy"));
        def aScript = clazz.newInstance(cwbdecodeexecDir);

        aScript.setC()
        def pAttributes = ["word","CATTEX2009","LEMMA"];
        aScript.setP(pAttributes)
        //aScript.setS(["s"])
        aScript.setr(registryPath)

        String rootdir = expandHome("~/xml/rgaqcj/lexicon");
        File lexRGAQCJFile = new File(rootdir, "lexRGAQCJ.txt");

        // System.out is redirected to lexRGAQCJ.txt while the decoder runs.
        PrintStream ps = new PrintStream(new FileOutputStream(lexRGAQCJFile));
        def out = System.out;
        System.setOut(ps);
        try {
            if (System.getProperty("os.name").contains("Windows"))
                aScript.cwbdecodeexe("RGAQCJ")
            else
                aScript.cwbdecode("RGAQCJ")
        } finally {
            // Always restore the console, then flush and release the file so
            // that the import below reads the complete content.
            System.setOut(out);
            ps.close();
        }
        println("fin")

        // load lexRGAQCJ.txt >> lexRGAQCJ(form,cat,val)
        String[] argsname = ["form","cat","val"];
        String[] types = ["VARCHAR(30)","VARCHAR(30)","VARCHAR(100)"];

        HSQLFunctions.ImportRefTable("lexRGAQCJ", argsname,types,lexRGAQCJFile,"\t","ISO-8859-1");

        // load afrlex.txt >> afrlex(form,cat,val)
        File afrlexFile = new File(rootdir, "afrlex.txt");
        HSQLFunctions.ImportRefTable("afrlex", argsname,types,afrlexFile,"\t","ISO-8859-1")

        // load TTnca_2_CTX9.txt >> TTnca_2_CTX9(catcna,catctx9)
        File correspFile = new File(rootdir, "TTnca_2_CTX9.txt");
        argsname = ["catcna","catctx9"];
        types = ["VARCHAR(20)","VARCHAR(20)"];
        HSQLFunctions.ImportCSVTable("TTnca_2_CTX9", argsname,types,correspFile,"\t","ISO-8859-1")

        // Union(lexRGAQCJ, afrlex) >> afrlex(form,cat,val)
        // NOTE(review): the two NOT IN conditions are independent, so a row of
        // lexRGAQCJ is skipped as soon as its form OR its cat appears anywhere
        // in afrlex, not only when the (form, cat) pair is already present --
        // confirm this is the intended union.
        //HSQLFunctions.CreateTable( "lextotal", argsname, types);
        String query =  "INSERT INTO afrlex " +
                        "SELECT * from lexRGAQCJ "+
                        "WHERE " +
                        "form NOT IN (SELECT form FROM afrlex) " +
                        "AND cat NOT IN (SELECT cat FROM afrlex) ";
        HSQLFunctions.executeQuery( query );

        // check for missing cat correspondence: each cat of afrlex without an
        // entry in TTnca_2_CTX9 is reported once and mapped to "<cat>ERROR"
        String query2 = "SELECT cat from afrlex WHERE cat NOT IN (SELECT catcna FROM TTnca_2_CTX9);"
        def addedCat = [];
        HSQLFunctions.getGroovySql().eachRow(query2) {
            if(!addedCat.contains(it.getAt(0)))
            {
                addedCat.add(it.getAt(0));
                println("!! No correspondance for "+ it.getAt(0));
                HSQLFunctions.executeQuery("INSERT INTO TTnca_2_CTX9 VALUES ('"+it.getAt(0)+"','"+it.getAt(0)+"ERROR')");
            }
        }

        // replace the cat values of afrlex by their catctx9 correspondence
        String query3 = "UPDATE afrlex " +
                        "SET cat = (SELECT catctx9 FROM TTnca_2_CTX9 WHERE cat=catcna)";
        HSQLFunctions.executeQuery( query3 );

        HSQLFunctions.printTable "afrlex";

        // export afrlex >> lexfra-rgaqcj.txt
        File rezlexRGAQCJFile = new File(rootdir, "lexfra-rgaqcj.txt");
        HSQLFunctions.toRefFile( "afrlex", rezlexRGAQCJFile.getAbsolutePath(),"form");
    }
}