root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / BuildAfrLexicon.groovy @ 1000
History | View | Annotate | Download (4.7 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 479 | mdecorde | // $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
|
25 | 321 | mdecorde | // $LastChangedRevision: 2386 $
|
26 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 321 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.scripts;
|
29 | 321 | mdecorde | |
30 | 321 | mdecorde | import groovy.lang.GroovyClassLoader |
31 | 321 | mdecorde | |
32 | 321 | mdecorde | import java.io.File |
33 | 321 | mdecorde | |
34 | 321 | mdecorde | import org.txm.utils.ProcessBuilderBuilder |
35 | 321 | mdecorde | // TODO: Auto-generated Javadoc
|
36 | 321 | mdecorde | |
/**
 * Builds the "afr" lexicon in SQL tables.
 *
 * Steps performed by {@link #main(String[])}:
 * <ol>
 * <li>decode the RGAQCJ corpus (word, CATTEX2009, LEMMA) into lexRGAQCJ.txt;</li>
 * <li>load lexRGAQCJ.txt, afrlex.txt and TTnca_2_CTX9.txt into tables;</li>
 * <li>add to afrlex the lexRGAQCJ rows selected by the union query;</li>
 * <li>register a placeholder mapping for every category without correspondence;</li>
 * <li>replace the afrlex categories using the TTnca_2_CTX9 mapping;</li>
 * <li>export afrlex to lexfra-rgaqcj.txt.</li>
 * </ol>
 *
 * All paths are hard-coded for the original developer's machine.
 *
 * @author mdecorde
 */
class BuildAfrLexicon
{
	/** Name of the CWB corpus the lexicon is extracted from. */
	private static final String CORPUS = "RGAQCJ";

	/**
	 * The main method.
	 *
	 * @param args the arguments (not used)
	 */
	static void main(String[] args)
	{
		HSQLFunctions.clearAll();

		//get lexicon from TXM RGAQCJ
		// NOTE(review): the wrapper is generated under org/textometrie/scripts but is
		// parsed below from org/txm/scripts - confirm which location is the right one.
		ProcessBuilderBuilder.build(new File("src/groovy/org/txm/scripts/cwb-decode-wrapper-definition.xml"), new File("src/groovy/org/textometrie/scripts/CwbDecode.groovy"));

		GroovyClassLoader gcl = new GroovyClassLoader();
		gcl.addClasspath(".");

		// The JVM never expands "~", so resolve it against user.home explicitly.
		String registryPath = expandHome("~/Bureau/trunkCWB/corpora/registry");
		//String registryPath = "C:/Documents and Settings/sheiden/.txm/registry"
		String cwbdecodeexecDir = expandHome("~/Bureau/trunkCWB/cwb-3.0/utils/");
		//String cwbdecodeexecDir = "C:/Projets/Textométrie/Logiciel/Toolbox/0.4.5/toolbox/src/main/C/cwb-3.0/utils/"

		Class clazz = gcl.parseClass(new File("src/groovy/org/txm/scripts/CwbDecode.groovy"));
		def aScript = clazz.newInstance(cwbdecodeexecDir);

		aScript.setC()
		def pAttributes = ["word","CATTEX2009","LEMMA"];
		aScript.setP(pAttributes)
		aScript.setr(registryPath)

		String rootdir = expandHome("~/xml/rgaqcj/lexicon");
		File lexRGAQCJFile = new File(rootdir, "lexRGAQCJ.txt");
		decodeCorpusTo(aScript, lexRGAQCJFile);
		println("fin")

		//Load lexRGAQCJ.txt >> lexRGAQCJ(form,cat,val)
		String[] lexColumns = ["form","cat","val"];
		String[] lexTypes = ["VARCHAR(30)","VARCHAR(30)","VARCHAR(100)"];
		HSQLFunctions.ImportRefTable("lexRGAQCJ", lexColumns, lexTypes, lexRGAQCJFile, "\t", "ISO-8859-1");

		//load afrlex.txt >> afrlex(form,cat,val)
		File afrlexFile = new File(rootdir, "afrlex.txt");
		HSQLFunctions.ImportRefTable("afrlex", lexColumns, lexTypes, afrlexFile, "\t", "ISO-8859-1")

		//Load TTnca_2_CTX9.txt >> TTnca_2_CTX9(catcna,catctx9)
		File correspFile = new File(rootdir, "TTnca_2_CTX9.txt");
		String[] mappingColumns = ["catcna","catctx9"];
		String[] mappingTypes = ["VARCHAR(20)","VARCHAR(20)"];
		HSQLFunctions.ImportCSVTable("TTnca_2_CTX9", mappingColumns, mappingTypes, correspFile, "\t", "ISO-8859-1")

		//Union(lexRGAQCJ, afrlex) >> afrlex(form,cat,val)
		// NOTE(review): form and cat are tested independently, so a row is skipped as
		// soon as its form OR its cat already occurs anywhere in afrlex - confirm that
		// a test on the (form, cat) pair was not intended.
		String query = "INSERT INTO afrlex " +
				"SELECT * from lexRGAQCJ "+
				"WHERE " +
				"form NOT IN (SELECT form FROM afrlex) " +
				"AND cat NOT IN (SELECT cat FROM afrlex) ";
		HSQLFunctions.executeQuery( query );

		//check for missing cat correspondence
		registerMissingCategories();

		//Substitute the categories >> afrlex(form,cat,val)
		String query3 = "UPDATE afrlex " +
				"SET cat = (SELECT catctx9 FROM TTnca_2_CTX9 WHERE cat=catcna)";
		HSQLFunctions.executeQuery( query3 );

		HSQLFunctions.printTable("afrlex");

		//export afrlex >> lexfra-rgaqcj.txt
		File rezlexRGAQCJFile = new File(rootdir, "lexfra-rgaqcj.txt");
		HSQLFunctions.toRefFile( "afrlex", rezlexRGAQCJFile.getAbsolutePath(), "form");
	}

	/**
	 * Replaces a leading "~" with the value of the user.home system property.
	 *
	 * @param path a path that may start with "~/" (or be exactly "~")
	 * @return the path with the home directory expanded, or the path unchanged
	 */
	private static String expandHome(String path)
	{
		if (path == "~" || path.startsWith("~/"))
			return System.getProperty("user.home") + path.substring(1);
		return path;
	}

	/**
	 * Runs the decoder script on the corpus while System.out is redirected to the
	 * given file, then restores System.out and closes the file even if the decoder
	 * throws.
	 *
	 * @param aScript the CwbDecode wrapper instance, already configured
	 * @param target the file receiving everything printed on System.out meanwhile
	 */
	private static void decodeCorpusTo(def aScript, File target)
	{
		PrintStream console = System.out;
		PrintStream ps = new PrintStream(new FileOutputStream(target));
		try {
			System.setOut(ps);
			if (System.getProperty("os.name").contains("Windows"))
				aScript.cwbdecodeexe(CORPUS)
			else
				aScript.cwbdecode(CORPUS)
		} finally {
			System.setOut(console);
			ps.close(); // also closes the underlying FileOutputStream
		}
	}

	/**
	 * Inserts a (cat, cat + "ERROR") row in TTnca_2_CTX9 for every distinct afrlex
	 * category that has no correspondence, and prints a warning for each of them.
	 */
	private static void registerMissingCategories()
	{
		String query2 = "SELECT cat from afrlex WHERE cat NOT IN (SELECT catcna FROM TTnca_2_CTX9);"

		// Collect first: the inserts below modify a table the query reads from.
		def missingCats = new LinkedHashSet();
		HSQLFunctions.getGroovySql().eachRow(query2) { row ->
			missingCats.add(row.getAt(0));
		}

		for (def cat : missingCats) {
			println("!! No correspondance for "+ cat);
			// Double the single quotes so a category containing ' keeps the SQL valid.
			String quoted = String.valueOf(cat).replace("'", "''");
			HSQLFunctions.executeQuery("INSERT INTO TTnca_2_CTX9 VALUES ('"+quoted+"','"+quoted+"ERROR')");
		}
	}
}