Statistics
| Revision:

root / projets / Palafra / frolex / lgerm-processed / lgerm-split.pl @ 496

History | View | Annotate | Download (1003 Bytes)

1
use strict;
2
use warnings;
3
use utf8;
4
use open ':encoding(utf8)';
5

    
6

    
7

    
8
#############
9

    
10

    
11

    
12
# Split "$file.xml" into several smaller files named "$file-1.xml",
# "$file-2.xml", ...
#
# A part is closed at the first line containing </lexicalEntry> once at least
# $min_lines_per_part lines have been written to it, so an entry is never cut
# in half. Every part closed this way gets a closing </LexicalResource> tag,
# and every part after the first gets an opening <LexicalResource> tag. The
# first part keeps whatever header the input starts with, and the last part
# keeps whatever the input ends with.
#
# Output is first written to a temporary file ("$file.<pid>") and renamed once
# the part is complete.
my $file = "LGeRM-lexique";

# Number of lines a part must reach before it may be closed.
my $min_lines_per_part = 1_000_000;

my $input_name = "$file.xml";
my $temp_name  = "$file.$$";

# No explicit I/O layer is given here: the file-level "use open" pragma
# supplies the default encoding layer for these handles.
open(my $in, '<', $input_name)
	or die "Cannot read from $input_name: $!\n";
open(my $out, '>', $temp_name)
	or die "Cannot write to $temp_name: $!\n";

print "Processing $input_name\n";

my $counter_lines = 0;	# lines of input written to the current part
my $counter_parts = 1;	# 1-based index of the part being written

# Stream the input line by line instead of loading it all into memory.
while (my $line = <$in>) {
	# A fresh part (other than the first) needs its own root element.
	if ($counter_lines == 0 and $counter_parts > 1) {
		print {$out} "<LexicalResource>\n";
	}
	$counter_lines++;
	print {$out} $line;

	# Only cut once the part is big enough AND we are at an entry boundary.
	next if $counter_lines < $min_lines_per_part;
	next if $line !~ m{</lexicalEntry>};

	my $part_name = "$file-$counter_parts.xml";
	print {$out} "</LexicalResource>";
	# Buffered write errors only surface at close, so check it.
	close($out)
		or die "Cannot write to $temp_name: $!\n";
	# Use "or", not "||": "||" would bind to the file name string and the
	# die could never be reached.
	rename($temp_name, $part_name)
		or die "Cannot write to $part_name: $!\n";

	$counter_parts++;
	$counter_lines = 0;
	open($out, '>', $temp_name)
		or die "Cannot write to $temp_name: $!\n";
}
close($in)
	or die "Cannot read from $input_name: $!\n";

# Finish the last (possibly only) part.
my $last_part_name = "$file-$counter_parts.xml";
close($out)
	or die "Cannot write to $temp_name: $!\n";
rename($temp_name, $last_part_name)
	or die "Cannot write to $last_part_name: $!\n";
print "Done\n";
48

    
49