Révision 3519

TXM/trunk/org.txm.utils/src/org/txm/utils/FileUtils.java (revision 3519)
44 44
		return FilenameUtils.getBaseName(f.getName());
45 45
	}
46 46
	
47
	/**
	 * Returns the base name of the given file name.
	 * <p>
	 * This is a thin wrapper that delegates directly to
	 * {@code FilenameUtils.getBaseName(filename)} and returns its result unchanged.
	 * <p>
	 * NOTE(review): presumably {@code FilenameUtils} is the Apache Commons IO helper,
	 * in which case any leading directory path is dropped along with the extension,
	 * and a {@code null} argument yields {@code null} -- confirm against the file's imports.
	 *
	 * @param filename the file name to process
	 * @return the value returned by {@code FilenameUtils.getBaseName(filename)}
	 */
	public static String stripExtension(String filename) {
48
		return FilenameUtils.getBaseName(filename);
49
	}
50
	
47 51
	/**
48 52
	 * select file and directories
49 53
	 * ignore hidden files
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3519)
10 10
import org.txm.importer.ApplyXsl2;
11 11
import org.txm.importer.xtz.*
12 12
import org.txm.objects.Project
13
import org.txm.utils.BundleUtils;
14 13
import org.txm.utils.io.FileCopy;
15 14
import org.txm.utils.io.IOUtils
16 15
import org.apache.log4j.BasicConfigurator;
......
50 49
		CallUD2TigerPerlScript cutps = new CallUD2TigerPerlScript();
51 50
		
52 51
		if (cutps.canBuildTSFiles()) {
53
			cutps.convertCoNLLUFiles(this.sourceDirectory.getAbsolutePath(), this.binaryDirectory.getAbsolutePath())
54 52
			
53
			println "Converting CoNLL-U files to TIGER-XML files..."
54
			
55
			cutps.convertCoNLLUFiles(this.sourceDirectory.listFiles(), this.binaryDirectory.getAbsolutePath())
56
			
55 57
			File tigerXMLDirectory = new File(this.binaryDirectory, "tiger-xml")
56 58
			String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
57 59
			File xslfile1 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-commentOutLongSentences.xsl")
58
			File xslfile2 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-sortBfmByDate.xsl")
60
			//File xslfile2 = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "tigerXml-sortBfmByDate.xsl")
59 61
			
60 62
			println "Post-processing TIGER-XML files..."
61 63
			ApplyXsl2 a1 = new ApplyXsl2(xslfile1);
62
			ApplyXsl2 a2 = new ApplyXsl2(xslfile2);
64
			//ApplyXsl2 a2 = new ApplyXsl2(xslfile2);
63 65
			for (File xmlFile : tigerXMLDirectory.listFiles()) {
64 66
				
65 67
				if (!xmlFile.getName().endsWith(".xml")) continue;
66
				if (xmlFile.getName().equals(driverFilename)) continue;
68
				if (xmlFile.getName().equals(driverFilename)) continue; // don't process the driver
67 69
				
68 70
				File xmlFileTmp = new File(xmlFile.getAbsolutePath()+".tmp")
69 71
				if (!(a1.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) {
70 72
					println "Error while applying $xslfile1 to $xmlFile"
71 73
				}
72 74
				
73
				if (!(a2.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) {
74
					println "Error while applying $xslfile2 to $xmlFile"
75
//				if (!(a2.process(xmlFile, xmlFileTmp) && xmlFile.delete() && xmlFileTmp.renameTo(xmlFile))) {
76
//					println "Error while applying $xslfile2 to $xmlFile"
77
//				}
78
			}
79
			
80
			println "Patching TIGER-XML driver file..."
81
			def tigerxmlFiles = []
82
			def xmltxmFilesNames = this.getTXMFilesOrder();
83
			println "xml-txm files: "+xmltxmFilesNames
84
			for (String name : xmltxmFilesNames) {
85
				name = FileUtils.stripExtension(name);
86
				
87
				File connluFile = new File(this.binaryDirectory, "tiger-xml/"+name+".xml")
88
				println " test "+connluFile
89
				if (connluFile.exists()) {
90
					tigerxmlFiles << connluFile
75 91
				}
76 92
			}
77 93
			
94
			// patch the subcorpus tags in the driver XML file with the right corpus order
95
			File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename)
96
			String content = IOUtils.getText(driver, "UTF-8");
97
			content = content.replaceAll("<subcorpus .+\n", "");
98
			String subcorpusList = "";
99
			for (String name : xmltxmFilesNames) {
100
				name = FileUtils.stripExtension(name);
101
				subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n"
102
			}
103
			content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags
104
			IOUtils.setText(driver, content, "UTF-8");
78 105
			// build TIGER indexes
79 106
			if (isSuccessful) {
80 107
				// read from the 'tiger-xml' and write to the 'tiger' directory
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3519)
62 62
		
63 63
		boolean keepContractions =  UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS)
64 64
		
65
		boolean usenewdocid =  UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)
65
		boolean usenewdocid =  false; // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE // UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)
66 66
		
67 67
		def headPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT).split(",") as Set
68 68
		
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 3519)
172 172
if ( $infile eq '' ) {
173 173
	$infile = 'subcorpus';
174 174
}
175
my $counter = 1;
176
$suffix = sprintf( "%05d", $counter );
175
#MD disable the counter and suffix variables and file renaming "$infilename-$suffix" -> "$infilename"
176
#my $counter = 1;
177
#$suffix = sprintf( "%05d", $counter );
177 178
$infilename = basename($infile);
178 179

  
179 180
mkdir("$outdir");
180
open( XML, ">$outdir/$infilename-$suffix.xml" )
181
  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
181
open( XML, ">$outdir/$infilename.xml" )
182
  or die "\nopen file error of $outdir/$infilename.xml\n";
182 183
open( LOG, ">$outdir/conversion.log" )
183 184
  or die "\nopen file error of conversion.log\n";
184 185
open( MASTER, ">$outdir/main.xml" ) or die "\nopen file error of main.xml\n";
......
194 195
$commandline = $0 . " " . ( join " ", @ARGV );
195 196
print LOG "$commandline\n\n";
196 197

  
197
print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
198
print MASTER "<subcorpus name='$infilename' external='file:$infilename.xml'/>\n";
198 199

  
199 200
$/ = "";    # treat empty line as RS
200 201
while (<>) {
201 202
	if ( $. % $split == 0 ) {
202 203
		print XML "</subcorpus>\n";
203 204
		close(XML);
204
		$suffix = sprintf( "%05d", ++$counter );
205
		open( XML, ">$outdir/$infilename-$suffix.xml" )
205
		#$suffix = sprintf( "%05d", ++$counter );
206
		open( XML, ">$outdir/$infilename.xml" )
206 207
		  or die "\nopen file error\n";
207 208
		write_xml_header();
208
		print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
209
		print MASTER "<subcorpus name='$infilename' external='file:$infilename.xml'/>\n";
209 210
	}
210 211

  
211 212
	# ----------------------------------------
......
551 552

  
552 553
sub write_xml_header {
553 554
	print XML "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
554
  <subcorpus name=\"$infilename-$suffix\">
555
  <subcorpus name=\"$infilename\">
555 556
";
556 557
}
557 558

  
TXM/trunk/org.txm.connlu.core/src/org/txm/conllu/core/CallUD2TigerPerlScript.java (revision 3519)
4 4
import java.io.IOException;
5 5
import java.util.ArrayList;
6 6
import java.util.Arrays;
7
import java.util.Comparator;
7 8

  
8 9
import org.eclipse.core.runtime.IStatus;
9 10
import org.txm.core.engines.ScriptEngine;
......
28 29
		return true;
29 30
	}
30 31
	
31
	public boolean convertCoNLLUFiles(String input_directory_path, String output_directory) throws IOException {
32
	public boolean convertCoNLLUFiles(File[] connluFiles, String output_directory) throws IOException {
32 33
		
33
		File input_directory = new File(input_directory_path);
34
		if (connluFiles.length == 0) return false;
34 35
		
35
		if (input_directory_path.length() == 0 && input_directory.canRead()) {
36
			System.out.println("** impossible to access directory "+input_directory+". Aborting.");
37
			return false;
38
		}
39
		
40 36
		File tigerXMLDirectory = new File(output_directory, "tiger-xml");
41 37
		File conversionFile = new File(tigerXMLDirectory, "conversion.log");
42 38
		
......
53 49
		//println "Converting..."
54 50
		
55 51
		String subcorpusList = "";
56
		for (File f : input_directory.listFiles()) {
52

  
53
		for (File f : connluFiles) {
57 54
			
58 55
			if (!f.getName().endsWith(".conllu")) {
59 56
				continue;

Formats disponibles : Unified diff