Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / BuildAfrLexicon.groovy @ 1000

History | View | Annotate | Download (4.7 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25 321 mdecorde
// $LastChangedRevision: 2386 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.scripts;
29 321 mdecorde
30 321 mdecorde
import groovy.lang.GroovyClassLoader
31 321 mdecorde
32 321 mdecorde
import java.io.File
33 321 mdecorde
34 321 mdecorde
import org.txm.utils.ProcessBuilderBuilder
35 321 mdecorde
// TODO: Auto-generated Javadoc
36 321 mdecorde
37 321 mdecorde
/**
38 321 mdecorde
 * build an afr lexicon in a SQL table.
39 321 mdecorde
 *
40 321 mdecorde
 * @author mdecorde
41 321 mdecorde
 */
42 321 mdecorde
class BuildAfrLexicon
{

        /**
         * Expands a leading "~" into the current user's home directory.
         *
         * Neither java.io.File nor a process argument list performs shell-style
         * tilde expansion, so a path such as "~/xml" would otherwise designate a
         * directory literally named "~" under the working directory.
         *
         * @param path a path that may start with "~" or "~/"
         * @return the path with the leading "~" replaced by the "user.home" system
         *         property, or the path unchanged when it does not start with "~"
         */
        private static String expandHome(String path)
        {
                if (path == "~" || path.startsWith("~/"))
                        return System.getProperty("user.home") + path.substring(1);
                return path;
        }

        /**
         * The main method.
         *
         * Dumps the RGAQCJ corpus lexicon to a text file through the generated
         * cwb-decode wrapper, loads it together with afrlex.txt and the category
         * correspondence table into SQL tables, merges the corpus lexicon into
         * afrlex, rewrites the afrlex categories through the correspondence
         * table and exports the result to lexfra-rgaqcj.txt.
         *
         * All paths are hard-coded for the original developer's workstation.
         *
         * @param args the arguments (not used)
         */
        static void main(String[] args)
        {
                HSQLFunctions.clearAll();

                // Generate the Groovy wrapper for cwb-decode from its XML definition.
                // The same File is used for generation and for loading: the original
                // code generated into org/textometrie/scripts but loaded from
                // org/txm/scripts, so the generated script was never the one parsed.
                File wrapperDefinition = new File("src/groovy/org/txm/scripts/cwb-decode-wrapper-definition.xml");
                File wrapperScript = new File("src/groovy/org/txm/scripts/CwbDecode.groovy");
                ProcessBuilderBuilder.build(wrapperDefinition, wrapperScript);

                GroovyClassLoader gcl = new GroovyClassLoader();
                gcl.addClasspath(".");

                String registryPath = expandHome("~/Bureau/trunkCWB/corpora/registry");
                //String registryPath = "C:/Documents and Settings/sheiden/.txm/registry"
                String cwbdecodeexecDir = expandHome("~/Bureau/trunkCWB/cwb-3.0/utils/");
                //String cwbdecodeexecDir = "C:/Projets/Textométrie/Logiciel/Toolbox/0.4.5/toolbox/src/main/C/cwb-3.0/utils/"

                Class clazz = gcl.parseClass(wrapperScript);
                def aScript = clazz.newInstance(cwbdecodeexecDir);

                // Configure the wrapper: the setters presumably map to the cwb-decode
                // options of the same name (-C, -P, -S, -r) -- confirm against the
                // wrapper definition file.
                aScript.setC()
                def pAttributes = ["word","CATTEX2009","LEMMA"];
                def sAttributes = ["s"];
                aScript.setP(pAttributes)
                //aScript.setS(sAttributes)
                aScript.setr(registryPath)

                String rootdir = expandHome("~/xml/rgaqcj/lexicon");
                File lexRGAQCJFile = new File(rootdir, "lexRGAQCJ.txt");

                // Capture what the wrapper prints on standard output into lexRGAQCJ.txt.
                // The redirection is undone and the file closed in a finally block so
                // that a failing cwb-decode neither leaves System.out hijacked nor
                // leaks the file handle.
                // NOTE(review): the stream uses the platform default charset while the
                // file is imported below as ISO-8859-1 -- confirm both agree.
                PrintStream console = System.out;
                PrintStream dump = new PrintStream(new FileOutputStream(lexRGAQCJFile));
                System.setOut(dump);
                try {
                        if (System.getProperty("os.name").contains("Windows"))
                                aScript.cwbdecodeexe("RGAQCJ")
                        else
                                aScript.cwbdecode("RGAQCJ")
                } finally {
                        System.setOut(console);
                        dump.close();
                }
                println("fin")

                // Load lexRGAQCJ.txt >> lexRGAQCJ(form,cat,val)
                String[] argsname = ["form","cat","val"];
                String[] types = ["VARCHAR(30)","VARCHAR(30)","VARCHAR(100)"];

                HSQLFunctions.ImportRefTable("lexRGAQCJ", argsname,types,lexRGAQCJFile,"\t","ISO-8859-1");

                // Load afrlex.txt >> afrlex(form,cat,val)
                File afrlexFile = new File(rootdir, "afrlex.txt");
                HSQLFunctions.ImportRefTable("afrlex", argsname,types,afrlexFile,"\t","ISO-8859-1")

                // Load TTnca_2_CTX9.txt >> TTnca_2_CTX9(catcna,catctx9)
                File correspFile = new File(rootdir, "TTnca_2_CTX9.txt");
                argsname = ["catcna","catctx9"];
                types = ["VARCHAR(20)","VARCHAR(20)"];
                HSQLFunctions.ImportCSVTable("TTnca_2_CTX9", argsname,types,correspFile,"\t","ISO-8859-1")

                // Merge lexRGAQCJ into afrlex.
                // NOTE(review): a row is copied only when BOTH its form and its cat are
                // absent from afrlex; confirm this is the intended merge condition.
                //HSQLFunctions.CreateTable( "lextotal", argsname, types);
                String query =         "INSERT INTO afrlex " +
                                                "SELECT * from lexRGAQCJ "+
                                                "WHERE " +
                                                "form NOT IN (SELECT form FROM afrlex) " +
                                                "AND cat NOT IN (SELECT cat FROM afrlex) ";
                HSQLFunctions.executeQuery( query );

                // Check for categories without a correspondence: each one is reported
                // once and given a "<cat>ERROR" placeholder mapping so that the
                // substitution below still finds a row for it.
                String query2 = "SELECT cat from afrlex WHERE cat NOT IN (SELECT catcna FROM TTnca_2_CTX9);"
                Set<String> reportedCats = new HashSet<String>();
                HSQLFunctions.getGroovySql().eachRow(query2) { row ->
                        String cat = String.valueOf(row.getAt(0));
                        // Set.add returns false for a category that was already handled
                        if (reportedCats.add(cat))
                        {
                                println("!! No correspondance for "+ cat);
                                // double single quotes so the value cannot break the SQL literal
                                String quoted = cat.replace("'", "''");
                                HSQLFunctions.executeQuery("INSERT INTO TTnca_2_CTX9 VALUES ('"+quoted+"','"+quoted+"ERROR')");
                        }
                }

                // Substitute the categories of afrlex through the correspondence table
                String query3 = "UPDATE afrlex " +
                                "SET cat = (SELECT catctx9 FROM TTnca_2_CTX9 WHERE cat=catcna)";
                HSQLFunctions.executeQuery( query3 );

                HSQLFunctions.printTable "afrlex";

                // Export afrlex >> lexfra-rgaqcj.txt
                File rezlexRGAQCJFile = new File(rootdir, "lexfra-rgaqcj.txt");
                HSQLFunctions.toRefFile( "afrlex", rezlexRGAQCJFile.getAbsolutePath(),"form");

        }
}