root / projets / Palafra / frolex / lgerm-processed / lgerm-split.pl @ 496
History | View | Annotate | Download (1003 Bytes)
use strict;
use warnings;
use utf8;
use open ':encoding(utf8)';

#############
# Split "LGeRM-lexique.xml" (read from the current directory) into several
# smaller files named "LGeRM-lexique-1.xml", "LGeRM-lexique-2.xml", ...
#
# A part is closed on the first line containing "</lexicalEntry>" once the
# part holds at least $max_lines lines. Every part closed that way gets a
# "</LexicalResource>" line appended, and every part after the first starts
# with a "<LexicalResource>" line. The beginning of the first part and the end
# of the last part are copied verbatim from the input.
#
# Output is written to a temporary file "LGeRM-lexique.<pid>" which is renamed
# to its final name only once the part is complete.
#############

my $file      = "LGeRM-lexique";
my $max_lines = 1000000;        # minimum number of lines before a part may be closed
my $tmp       = "$file.$$";     # temporary output name, unique per process

open(my $in,  '<', "$file.xml") or die "Cannot read from $file.xml: $!\n";
open(my $out, '>', $tmp)        or die "Cannot write to $tmp: $!\n";

print "Processing $file.xml\n";

my $counter_lines = 0;          # lines read for the current part
my $counter_parts = 1;          # number of the part being written

# Stream the input line by line rather than loading it all into memory.
while (my $line = <$in>) {
    # The first line of every part but the first is preceded by the root
    # opening tag.
    if ($counter_lines == 0 and $counter_parts > 1) {
        print {$out} "<LexicalResource>\n";
    }
    $counter_lines++;

    # Every input line is copied to the current part.
    print {$out} $line;

    # Once the part is long enough, cut at the next end of a lexical entry.
    if ($counter_lines >= $max_lines and $line =~ /<\/lexicalEntry>/) {
        print {$out} "</LexicalResource>";
        finish_part($out, $tmp, "$file-$counter_parts.xml");
        $counter_parts++;
        $counter_lines = 0;
        open($out, '>', $tmp) or die "Cannot write to $tmp: $!\n";
    }
}
close($in);

if ($counter_lines == 0 and $counter_parts > 1) {
    # The input ended exactly on a split boundary: the temporary file opened
    # after the last cut is empty, so drop it instead of publishing an empty
    # (and therefore invalid) XML part.
    close($out) or die "Cannot write to $tmp: $!\n";
    unlink($tmp) or die "Cannot remove $tmp: $!\n";
}
else {
    finish_part($out, $tmp, "$file-$counter_parts.xml");
}
print "Done\n";

# Close the temporary output handle and move the file to its final name.
# Dies if buffered data could not be flushed or if the rename fails.
sub finish_part {
    my ($fh, $from, $to) = @_;
    close($fh) or die "Cannot write to $from: $!\n";
    # "or" (not "||") so the check applies to rename() itself rather than
    # to its last argument.
    rename($from, $to) or die "Cannot write to $to: $!\n";
    return;
}