Révision 3519
TXM/trunk/org.txm.utils/src/org/txm/utils/FileUtils.java (revision 3519) | ||
---|---|---|
44 | 44 |
return FilenameUtils.getBaseName(f.getName()); |
45 | 45 |
} |
46 | 46 |
|
47 |
public static String stripExtension(String filename) { |
|
48 |
return FilenameUtils.getBaseName(filename); |
|
49 |
} |
|
50 |
|
|
47 | 51 |
/** |
48 | 52 |
* select file and directories |
49 | 53 |
* ignore hidden files |
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3519) | ||
---|---|---|
10 | 10 |
import org.txm.importer.ApplyXsl2; |
11 | 11 |
import org.txm.importer.xtz.* |
12 | 12 |
import org.txm.objects.Project |
13 |
import org.txm.utils.BundleUtils; |
|
14 | 13 |
import org.txm.utils.io.FileCopy; |
15 | 14 |
import org.txm.utils.io.IOUtils |
16 | 15 |
import org.apache.log4j.BasicConfigurator; |
... | ... | |
50 | 49 |
CallUD2TigerPerlScript cutps = new CallUD2TigerPerlScript(); |
51 | 50 |
|
52 | 51 |
if (cutps.canBuildTSFiles()) { |
53 |
cutps.convertCoNLLUFiles(this.sourceDirectory.getAbsolutePath(), this.binaryDirectory.getAbsolutePath()) |
|
54 | 52 |
|
53 |
println "Converting CoNLL-U files to TIGER-XML files..." |
|
54 |
|
|
55 |
cutps.convertCoNLLUFiles(this.sourceDirectory.listFiles(), this.binaryDirectory.getAbsolutePath()) |
|
56 |
|
|
55 | 57 |
File tigerXMLDirectory = new File(this.binaryDirectory, "tiger-xml") |
56 | 58 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
57 | 59 |
File xslfile1 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-commentOutLongSentences.xsl") |
58 |
File xslfile2 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-sortBfmByDate.xsl") |
|
60 |
//File xslfile2 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-sortBfmByDate.xsl")
|
|
59 | 61 |
|
60 | 62 |
println "Post-processing TIGER-XML files..." |
61 | 63 |
ApplyXsl2 a1 = new ApplyXsl2(xslfile1); |
62 |
ApplyXsl2 a2 = new ApplyXsl2(xslfile2); |
|
64 |
//ApplyXsl2 a2 = new ApplyXsl2(xslfile2);
|
|
63 | 65 |
for (File xmlFile : tigerXMLDirectory.listFiles()) { |
64 | 66 |
|
65 | 67 |
if (!xmlFile.getName().endsWith(".xml")) continue; |
66 |
if (xmlFile.getName().equals(driverFilename)) continue; |
|
68 |
if (xmlFile.getName().equals(driverFilename)) continue; // don't process the driver
|
|
67 | 69 |
|
68 | 70 |
File xmlFileTmp = new File(xmlFile.getAbsolutePath()+".tmp") |
69 | 71 |
if (!(a1.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) { |
70 | 72 |
println "Error while applying $xslfile1 to $xmlFile" |
71 | 73 |
} |
72 | 74 |
|
73 |
if (!(a2.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) { |
|
74 |
println "Error while applying $xslfile2 to $xmlFile" |
|
75 |
// if (!(a2.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) { |
|
76 |
// println "Error while applying $xslfile2 to $xmlFile" |
|
77 |
// } |
|
78 |
} |
|
79 |
|
|
80 |
println "Patching TIGER-XML driver file..." |
|
81 |
def tigerxmlFiles = [] |
|
82 |
def xmltxmFilesNames = this.getTXMFilesOrder(); |
|
83 |
println "xml-txm files: "+xmltxmFilesNames |
|
84 |
for (String name : xmltxmFilesNames) { |
|
85 |
name = FileUtils.stripExtension(name); |
|
86 |
|
|
87 |
File connluFile = new File(this.binaryDirectory, "tiger-xml/"+name+".xml") |
|
88 |
println " test "+connluFile |
|
89 |
if (connluFile.exists()) { |
|
90 |
tigerxmlFiles << connluFile |
|
75 | 91 |
} |
76 | 92 |
} |
77 | 93 |
|
94 |
// patch the subcorpus tags in the driver XML file with the right corpus order |
|
95 |
File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename) |
|
96 |
String content = IOUtils.getText(driver, "UTF-8"); |
|
97 |
content = content.replaceAll("<subcorpus .+\n", ""); |
|
98 |
String subcorpusList = ""; |
|
99 |
for (String name : xmltxmFilesNames) { |
|
100 |
name = FileUtils.stripExtension(name); |
|
101 |
subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n" |
|
102 |
} |
|
103 |
content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags |
|
104 |
IOUtils.setText(driver, content, "UTF-8"); |
|
78 | 105 |
// build TIGER indexes |
79 | 106 |
if (isSuccessful) { |
80 | 107 |
// read from the 'tiger-xml' and write to the 'tiger' directory |
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3519) | ||
---|---|---|
62 | 62 |
|
63 | 63 |
boolean keepContractions = UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS) |
64 | 64 |
|
65 |
boolean usenewdocid = UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID) |
|
65 |
boolean usenewdocid = false; // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE // UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)
|
|
66 | 66 |
|
67 | 67 |
def headPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT).split(",") as Set |
68 | 68 |
|
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 3519) | ||
---|---|---|
172 | 172 |
if ( $infile eq '' ) { |
173 | 173 |
$infile = 'subcorpus'; |
174 | 174 |
} |
175 |
my $counter = 1; |
|
176 |
$suffix = sprintf( "%05d", $counter ); |
|
175 |
#MD disable the counter and suffix variables and file renaming "$infilename-$suffix" -> "$infilename" |
|
176 |
#my $counter = 1; |
|
177 |
#$suffix = sprintf( "%05d", $counter ); |
|
177 | 178 |
$infilename = basename($infile); |
178 | 179 |
|
179 | 180 |
mkdir("$outdir"); |
180 |
open( XML, ">$outdir/$infilename-$suffix.xml" )
|
|
181 |
or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
|
|
181 |
open( XML, ">$outdir/$infilename.xml" ) |
|
182 |
or die "\nopen file error of $outdir/$infilename.xml\n"; |
|
182 | 183 |
open( LOG, ">$outdir/conversion.log" ) |
183 | 184 |
or die "\nopen file error of conversion.log\n"; |
184 | 185 |
open( MASTER, ">$outdir/main.xml" ) or die "\nopen file error of main.xml\n"; |
... | ... | |
194 | 195 |
$commandline = $0 . " " . ( join " ", @ARGV ); |
195 | 196 |
print LOG "$commandline\n\n"; |
196 | 197 |
|
197 |
print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
|
|
198 |
print MASTER "<subcorpus name='$infilename' external='file:$infilename.xml'/>\n";
|
|
198 | 199 |
|
199 | 200 |
$/ = ""; # treat empty line as RS |
200 | 201 |
while (<>) { |
201 | 202 |
if ( $. % $split == 0 ) { |
202 | 203 |
print XML "</subcorpus>\n"; |
203 | 204 |
close(XML); |
204 |
$suffix = sprintf( "%05d", ++$counter ); |
|
205 |
open( XML, ">$outdir/$infilename-$suffix.xml" )
|
|
205 |
#$suffix = sprintf( "%05d", ++$counter );
|
|
206 |
open( XML, ">$outdir/$infilename.xml" ) |
|
206 | 207 |
or die "\nopen file error\n"; |
207 | 208 |
write_xml_header(); |
208 |
print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
|
|
209 |
print MASTER "<subcorpus name='$infilename' external='file:$infilename.xml'/>\n";
|
|
209 | 210 |
} |
210 | 211 |
|
211 | 212 |
# ---------------------------------------- |
... | ... | |
551 | 552 |
|
552 | 553 |
sub write_xml_header { |
553 | 554 |
print XML "<?xml version=\"1.0\" encoding=\"UTF-8\"?> |
554 |
<subcorpus name=\"$infilename-$suffix\">
|
|
555 |
<subcorpus name=\"$infilename\"> |
|
555 | 556 |
"; |
556 | 557 |
} |
557 | 558 |
|
TXM/trunk/org.txm.connlu.core/src/org/txm/conllu/core/CallUD2TigerPerlScript.java (revision 3519) | ||
---|---|---|
4 | 4 |
import java.io.IOException; |
5 | 5 |
import java.util.ArrayList; |
6 | 6 |
import java.util.Arrays; |
7 |
import java.util.Comparator; |
|
7 | 8 |
|
8 | 9 |
import org.eclipse.core.runtime.IStatus; |
9 | 10 |
import org.txm.core.engines.ScriptEngine; |
... | ... | |
28 | 29 |
return true; |
29 | 30 |
} |
30 | 31 |
|
31 |
public boolean convertCoNLLUFiles(String input_directory_path, String output_directory) throws IOException {
|
|
32 |
public boolean convertCoNLLUFiles(File[] connluFiles, String output_directory) throws IOException {
|
|
32 | 33 |
|
33 |
File input_directory = new File(input_directory_path);
|
|
34 |
if (connluFiles.length == 0) return false;
|
|
34 | 35 |
|
35 |
if (input_directory_path.length() == 0 && input_directory.canRead()) { |
|
36 |
System.out.println("** impossible to access directory "+input_directory+". Aborting."); |
|
37 |
return false; |
|
38 |
} |
|
39 |
|
|
40 | 36 |
File tigerXMLDirectory = new File(output_directory, "tiger-xml"); |
41 | 37 |
File conversionFile = new File(tigerXMLDirectory, "conversion.log"); |
42 | 38 |
|
... | ... | |
53 | 49 |
//println "Converting..." |
54 | 50 |
|
55 | 51 |
String subcorpusList = ""; |
56 |
for (File f : input_directory.listFiles()) { |
|
52 |
|
|
53 |
for (File f : connluFiles) { |
|
57 | 54 |
|
58 | 55 |
if (!f.getName().endsWith(".conllu")) { |
59 | 56 |
continue; |
Formats disponibles : Unified diff