Statistics
| Revision:

root / tmp / org.txm.core / res / org / txm / xml / xsl / tei / docx / from / docxtotei.xsl @ 187

History | View | Annotate | Download (17.8 kB)

1
<?xml version="1.0" encoding="utf-8"?>
2
<xsl:stylesheet xmlns:xs="http://www.w3.org/2001/XMLSchema"
3
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
4
                xmlns:prop="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties"
5
                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
6
                xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
7
                xmlns:dc="http://purl.org/dc/elements/1.1/"
8
                xmlns:dcterms="http://purl.org/dc/terms/"
9
                xmlns:dcmitype="http://purl.org/dc/dcmitype/"
10
                xmlns:iso="http://www.iso.org/ns/1.0"
11
                xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
12
                xmlns:mml="http://www.w3.org/1998/Math/MathML"
13
                xmlns:mo="http://schemas.microsoft.com/office/mac/office/2008/main"
14
                xmlns:mv="urn:schemas-microsoft-com:mac:vml"
15
                xmlns:o="urn:schemas-microsoft-com:office:office"
16
                xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture"
17
                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
18
                xmlns:rel="http://schemas.openxmlformats.org/package/2006/relationships"
19
                xmlns:tbx="http://www.lisa.org/TBX-Specification.33.0.html"
20
		xmlns:html="http://www.w3.org/1999/xhtml"
21
                xmlns:tei="http://www.tei-c.org/ns/1.0"
22
                xmlns:teidocx="http://www.tei-c.org/ns/teidocx/1.0"
23
                xmlns:v="urn:schemas-microsoft-com:vml"
24
                xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
25
                xmlns:w10="urn:schemas-microsoft-com:office:word"
26
                xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
27
                xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
28
                xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
29
                
30
                xmlns="http://www.tei-c.org/ns/1.0"
31
                version="2.0"
32
                exclude-result-prefixes="a cp dc dcterms dcmitype prop
33
					  iso m mml mo mv o pic r rel   html    tbx tei teidocx v xs ve w10 w wne wp">
34

    
35
	  <xsl:import href="../../common2/functions.xsl"/>
36
	  <xsl:import href="../utils/maths/omml2mml.xsl"/>
37
	  <xsl:import href="../utils/functions.xsl"/>
38
	  <xsl:import href="../utils/variables.xsl"/>
39
	  <xsl:import href="../utils/identity/identity.xsl"/>
40

    
41
	  <xsl:import href="parameters.xsl"/>
42
	  <xsl:include href="pass0/pass0.xsl"/>
43
	  <xsl:include href="pass2/pass2.xsl"/>
44
	
45
	  <xsl:include href="dynamic/fields.xsl"/>
46
	  <xsl:include href="dynamic/toc.xsl"/>
47
	  <xsl:include href="graphics/graphics.xsl"/>
48
	  <xsl:include href="lists/lists.xsl"/>
49
	  <xsl:include href="marginals/marginals.xsl"/>
50
	  <xsl:include href="maths/maths.xsl"/>
51
	  <xsl:include href="paragraphs/paragraphs.xsl"/>
52
	  <xsl:include href="tables/tables.xsl"/>
53
	  <xsl:include href="templates/tei-templates.xsl"/>
54
	  <xsl:include href="textruns/textruns.xsl"/>
55
	  <xsl:include href="utils/utility-templates.xsl"/>
56
	  <xsl:include href="wordsections/wordsections.xsl"/>
57
	
58
	
59
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl" scope="stylesheet" type="stylesheet">
60
      <desc>
61
         <p> TEI stylesheet for converting Word docx files to TEI </p>
62
         <p>This software is dual-licensed:
63

    
64
1. Distributed under a Creative Commons Attribution-ShareAlike 3.0
65
Unported License http://creativecommons.org/licenses/by-sa/3.0/ 
66

    
67
2. http://www.opensource.org/licenses/BSD-2-Clause
68
		
69
All rights reserved.
70

    
71
Redistribution and use in source and binary forms, with or without
72
modification, are permitted provided that the following conditions are
73
met:
74

    
75
* Redistributions of source code must retain the above copyright
76
notice, this list of conditions and the following disclaimer.
77

    
78
* Redistributions in binary form must reproduce the above copyright
79
notice, this list of conditions and the following disclaimer in the
80
documentation and/or other materials provided with the distribution.
81

    
82
This software is provided by the copyright holders and contributors
83
"as is" and any express or implied warranties, including, but not
84
limited to, the implied warranties of merchantability and fitness for
85
a particular purpose are disclaimed. In no event shall the copyright
86
holder or contributors be liable for any direct, indirect, incidental,
87
special, exemplary, or consequential damages (including, but not
88
limited to, procurement of substitute goods or services; loss of use,
89
data, or profits; or business interruption) however caused and on any
90
theory of liability, whether in contract, strict liability, or tort
91
(including negligence or otherwise) arising in any way out of the use
92
of this software, even if advised of the possibility of such damage.
93
</p>
94
         <p>Author: See AUTHORS</p>
95
         <p>Id: $Id: docxtotei.xsl 10200 2012-03-30 15:45:18Z rahtz $</p>
96
         <p>Copyright: 2008, TEI Consortium</p>
97
      </desc>
98
   </doc>
99

    
100
	  <!-- Vendor string reported by the XSLT processor running this stylesheet. -->
	  <xsl:variable name="processor">
	    <xsl:value-of select="system-property('xsl:vendor')"/>
	  </xsl:variable>

	  <!-- Plain string constants: the ten decimal digits and a set of
	       punctuation/symbol characters. -->
	  <xsl:variable name="digits">1234567890</xsl:variable>
	  <xsl:variable name="characters">~!@#$%^&amp;*()&lt;&gt;{}[]|:;,.?`'"=+-_</xsl:variable>

	  <!-- Location of the unpacked docx, with every backslash replaced by a
	       forward slash so that it can be concatenated into document locations.
	       NOTE(review): $word-directory is not declared in this file; presumably
	       it is supplied by one of the imported stylesheets. Confirm. -->
	  <xsl:variable name="wordDirectory">
	    <xsl:value-of select="translate($word-directory,'\\','/')"/>
	  </xsl:variable>

	  <!-- Package parts loaded once and shared by all templates. -->
	  <xsl:variable name="docProps"
	                select="doc(concat($wordDirectory,'/docProps/core.xml'))"/>
	  <xsl:variable name="styledoc"
	                select="doc(concat($wordDirectory,'/word/styles.xml'))"/>

	  <!-- Whitespace-only text nodes are dropped from the input everywhere
	       except inside w:t, where they are kept. -->
	  <xsl:strip-space elements="*"/>
	  <xsl:preserve-space elements="w:t"/>

	  <!-- Serialize the result as indented UTF-8 XML 1.0. -->
	  <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
117

    
118
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
119
      <desc>
120
         <p>The main template that starts the conversion from docx to TEI</p>
121
	 <p><b>IMPORTING STYLESHEETS AND OVERRIDING MATCHED TEMPLATES:</b></p>
122
		
123
	<p>When importing a stylesheet (xsl:import) all the templates
124
	in the imported stylesheet get a lower import-precedence than
125
	the ones in the importing stylesheet. If the importing
126
	stylesheet wants to override, let's say a general template to
127
	match all &lt;w:p&gt; elements where no more specialized rule
128
	applies it can't, since it will automatically override all
129
	w:p[somepredicate] template in the imported stylesheet as
130
	well.  In this case we have outsourced the processing of the
131
	general template into a named template and all the imported
132
	stylesheet does is to call the named template. Now, the
133
	importing stylesheet can simply override the named template,
134
	and everything works out fine.</p>
135
		
136
	<p>See templates: - w:p (mode: paragraph)</p>
137
	
138
	<p>Modes:</p>
139
	<ul>
140
	  <li>pass0: a normalization process for styles. Can also
141
	  detect illegal styles.</li>
142
	  
143
	  <li>pass2: templates that apply in the second stage
144
	  of the conversion, cleaning TEI elements created in the
145
	  first stage.</li>
146
	  
147
	  <li>inSectionGroup: defines a template that works on a
148
	  group of consecutive elements (w:p or w:tbl elements) that
149
	  form a section (a normal section, not to be confused with
150
	  w:sectPr).</li>
151
	  
152
	  <li>paragraph: defines that the template
153
	  works on an individual element (usually
154
	  starting with a w:p element).  </li>
155
	  
156
	  <li>iden: simply copies the content</li>
157
	</ul>
158
	
159
      </desc>
160
   </doc>
161
   <xsl:template match="/">
     <!-- Stage 0: run the input through the pass0 mode and keep the
          normalized result as a temporary tree. -->
     <xsl:variable name="pass0">
       <xsl:apply-templates mode="pass0"/>
     </xsl:variable>

     <!-- Stage 1: the main conversion, applied in the default mode to the
          children of the pass0 tree; the result is again kept in a variable. -->
     <xsl:variable name="pass1">
       <xsl:apply-templates select="$pass0/node()"/>
     </xsl:variable>

     <!-- Debugging aid: enable to dump the intermediate pass1 tree.
	 <xsl:result-document href="/tmp/foo.xml">
	 <xsl:copy-of select="$pass1"/>
	 </xsl:result-document>
     -->

     <!-- Stage 2: clean-up pass that emits the final TEI. -->
     <xsl:apply-templates select="$pass1" mode="pass2"/>

     <!-- Extension point, invoked after all output has been produced. -->
     <xsl:call-template name="fromDocxFinalHook"/>
   </xsl:template>
184
   
185
   <xsl:template name="fromDocxFinalHook">
     <!-- Intentionally empty: called at the very end of the root template.
          An importing stylesheet can redefine this named template to add
          its own final processing. -->
   </xsl:template>
186
   
187
   <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
188
     <desc>
189
		Main document template
190
     </desc>
191
   </doc>
192
	  <xsl:template match="w:document">
	    <!-- The Word document element becomes the TEI root element. -->
	    <TEI>
	      <!-- The header is built by the named template create-tei-header. -->
	      <xsl:call-template name="create-tei-header"/>

	      <!-- Everything else comes from the w:body child. -->
	      <xsl:apply-templates select="w:body"/>
	    </TEI>
	  </xsl:template>
201

    
202

    
203
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
204
	    <desc>
205
	      Create the basic text; worry later about dividing it up
206
	    </desc>
207
	  </doc>
208
	  <xsl:template match="w:body">
	    <!-- w:body maps to a TEI text element containing optional forme
	         work followed by the body proper. -->
	    <text>
	      <!-- Running headers and footers, emitted before the body. -->
	      <xsl:call-template name="extract-forme-work"/>

	      <!-- The content itself, split into divisions by mainProcess. -->
	      <body>
	        <xsl:call-template name="mainProcess"/>
	      </body>
	    </text>
	  </xsl:template>
219

    
220
	  
221
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
222
	    <desc>
223
	      Process the text by high-level divisions
224
	    </desc>
225
	  </doc>
226
	  <xsl:template name="mainProcess">
	    <!-- Partition the w:sdt, w:p and w:tbl children of the context node:
	         every w:p for which tei:is-firstlevel-heading() holds opens a new
	         group, so each group is one first-level section (or the material
	         that precedes the first such heading). -->
	    <xsl:for-each-group select="w:sdt|w:p|w:tbl"
	                        group-starting-with="w:p[tei:is-firstlevel-heading(.)]">
	      <xsl:choose>
	        <!-- The group opens with a heading: hand it to group-by-section,
	             which divides it further into subsections. -->
	        <xsl:when test="tei:is-heading(.)">
	          <xsl:call-template name="group-by-section"/>
	        </xsl:when>

	        <!-- Loose paragraphs with no leading heading, most probably
	             front matter paragraphs. They are converted directly, without
	             trying to split them into subsections. -->
	        <xsl:otherwise>
	          <xsl:apply-templates select="." mode="inSectionGroup"/>
	        </xsl:otherwise>
	      </xsl:choose>
	    </xsl:for-each-group>

	    <!-- w:sectPr is not part of the grouped selection above, so it is
	         processed explicitly here.
	         TODO (from the original author): find out why this is needed. -->
	    <xsl:apply-templates select="w:sectPr" mode="paragraph"/>
	  </xsl:template>
256
	  
257
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
258
	    <desc>
259
	      <p>Bookmarks in section mode</p>
260
	      <p>
261
		There are certain elements that we don't really care about, but that
262
		force us to regroup everything from the next sibling on.
263
		
264
		@see grouping in construction of headline outline.
265
	      </p>
266
	    </desc>
267
	  </doc>
268
	  <xsl:template match="w:bookmarkStart|w:bookmarkEnd"
	                mode="inSectionGroup">
	    <!-- The bookmark itself produces nothing in this mode. The rest of
	         the current group (everything except this node) is regrouped
	         under a constant key, i.e. as a single new group, and processing
	         restarts from its first item. -->
	    <xsl:for-each-group select="current-group() except ."
	                        group-adjacent="1">
	      <xsl:apply-templates select="." mode="inSectionGroup"/>
	    </xsl:for-each-group>
	  </xsl:template>
274

    
275

    
276
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
277
	    <desc>
278
	      <p>Bookmarks in normal mode</p>
279
	      <p>Copy bookmarks for processing in pass 2</p>
280
	    </desc>
281
	  </doc>
282
	  <xsl:template match="w:bookmarkStart|w:bookmarkEnd">
	    <xsl:choose>
	      <!-- Bookmarks whose name starts with "_Ref" are copied unchanged. -->
	      <xsl:when test="starts-with(@w:name,'_Ref')">
	        <xsl:copy-of select="."/>
	      </xsl:when>

	      <!-- Any other bookmark end is dropped. -->
	      <xsl:when test="self::w:bookmarkEnd"/>

	      <!-- Any other bookmark start becomes an ANCHOR whose xml:id is the
	           bookmark name minus its first character.
	           NOTE(review): this looks like it assumes a leading underscore;
	           a name without one would lose its first letter. Confirm. -->
	      <xsl:otherwise>
	        <ANCHOR>
	          <xsl:attribute name="xml:id">
	            <xsl:value-of select="substring(@w:name,2)"/>
	          </xsl:attribute>
	        </ANCHOR>
	      </xsl:otherwise>
	    </xsl:choose>
	  </xsl:template>
295
	  
296
	  <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
297
      <desc>
298
         <p>Grouping consecutive elements that belong together</p>
299
         <p>
300
		We are now working on a group of all elements inside some group bounded by
301
		headings. These need to be further split up into smaller groups for figures,
302
		list etc. and into individual groups for simple paragraphs...
303
		</p>
304
      </desc>
305
	  </doc>
306
	  <xsl:template match="w:tbl|w:p" mode="inSectionGroup">
	    <!-- Regroup the current group so that adjacent items of the same
	         kind end up together. Grouping keys:
	           1 = list items
	           2 = table of contents
	           3 = figures
	           4 = lines
	         Every other item gets position() + 100, a key that differs from
	         its neighbours, so it forms a group of its own. -->
	    <xsl:for-each-group select="current-group()"
	                        group-adjacent="if (tei:is-list(.)) then 1
	                                        else if (tei:is-toc(.)) then 2
	                                        else if (tei:is-figure(.)) then 3
	                                        else if (tei:is-line(.)) then 4
	                                        else position() + 100">
	      <xsl:variable name="groupKind" select="current-grouping-key()"/>

	      <!-- Dispatch on the kind of group; anything without a dedicated
	           handler is processed in paragraph mode. -->
	      <xsl:choose>
	        <xsl:when test="$groupKind=1">
	          <xsl:call-template name="listSection"/>
	        </xsl:when>
	        <xsl:when test="$groupKind=2">
	          <xsl:call-template name="tocSection"/>
	        </xsl:when>
	        <xsl:when test="$groupKind=3">
	          <xsl:call-template name="figureSection"/>
	        </xsl:when>
	        <xsl:when test="$groupKind=4">
	          <xsl:call-template name="lineSection"/>
	        </xsl:when>
	        <xsl:otherwise>
	          <xsl:apply-templates select="." mode="paragraph"/>
	        </xsl:otherwise>
	      </xsl:choose>
	    </xsl:for-each-group>
	  </xsl:template>
348

    
349
    <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
350
      <desc>
351
	Creating a group of a figure
352
      </desc>
353
   </doc>
354
    <xsl:template name="figureSection">
      <!-- Wrap the whole current group in one figure element; each member
           is converted individually in paragraph mode. -->
      <figure>
        <xsl:for-each select="current-group()">
          <xsl:apply-templates select="." mode="paragraph"/>
        </xsl:for-each>
      </figure>
    </xsl:template>
361

    
362
    <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
363
      <desc>
364
	Creating a gloss list
365
      </desc>
366
   </doc>
367

    
368
    <xsl:template name="glossListSection">
      <!-- Wrap the whole current group in a list of type "gloss"; each
           member is converted individually in paragraph mode. -->
      <list type="gloss">
        <xsl:for-each select="current-group()">
          <xsl:apply-templates select="." mode="paragraph"/>
        </xsl:for-each>
      </list>
    </xsl:template>
375

    
376
    <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
377
      <desc>
378
	Creating a line group (lg) from a run of consecutive line paragraphs
379
      </desc>
380
   </doc>
381
    <xsl:template name="lineSection">
      <!-- Wrap the whole current group in one lg element; each member is
           converted individually in paragraph mode. -->
      <lg>
        <xsl:for-each select="current-group()">
          <xsl:apply-templates select="." mode="paragraph"/>
        </xsl:for-each>
      </lg>
    </xsl:template>
388

    
389
    <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
390
      <desc>
391
	Groups the document by headings and thereby creating the document structure. 
392
      </desc>
393
   </doc>
394
   <xsl:template name="group-by-section">
     <!-- Paragraph style of the heading that opens this section, and the
          style that tei:get-nextlevel-header() returns for it. -->
     <xsl:variable name="headingStyle" select="w:pPr/w:pStyle/@w:val"/>
     <xsl:variable name="subHeadingStyle"
                   select="tei:get-nextlevel-header($headingStyle)"/>

     <div>
       <!-- The section head is produced from the heading paragraph. -->
       <xsl:call-template name="generate-section-heading">
         <xsl:with-param name="Style" select="$headingStyle"/>
       </xsl:call-template>

       <!-- The remainder of the group (everything but the heading) is cut
            at each paragraph carrying the next-level style. Pieces that
            open with a heading recurse into this template; the others are
            handled in inSectionGroup mode. -->
       <xsl:for-each-group select="current-group() except ."
                           group-starting-with="w:p[w:pPr/w:pStyle/@w:val=$subHeadingStyle]">
         <xsl:choose>
           <xsl:when test="tei:is-heading(.)">
             <xsl:call-template name="group-by-section"/>
           </xsl:when>
           <xsl:otherwise>
             <xsl:apply-templates select="." mode="inSectionGroup"/>
           </xsl:otherwise>
         </xsl:choose>
       </xsl:for-each-group>
     </div>
   </xsl:template>
416
   
417

    
418
   <doc xmlns="http://www.oxygenxml.com/ns/doc/xsl">
419
     <desc>
420
       <p>Looks through the document to find forme work related sections.</p>
421
       <p>
422
	 Creates a &lt;fw&gt; element for each forme work related section. These include
423
	 running headers and footers. The corresponding elements in OOXML are w:headerReference
424
	 and w:footerReference. These elements only define a reference to a header or
425
	 footer definition file. The reference itself is resolved in the file word/_rels/document.xml.rels.
426
       </p>
427
     </desc>
428
   </doc>
429
	  <xsl:template name="extract-forme-work">
	    <!-- Emits one fw element per distinct header/footer reference, but
	         only when the preserveWordHeadersFooters parameter is 'true'.
	         The test must read the parameter with a $ sign: the unprefixed
	         name used before was a child-element test on the context node,
	         which never matches, so no forme work was ever produced.
	         NOTE(review): the parameter is expected to be declared in the
	         imported parameters.xsl. Confirm. -->
	    <xsl:if test="$preserveWordHeadersFooters='true'">
	      <!-- Group by relationship id so that a header or footer referenced
	           from several places is extracted only once. -->
	      <xsl:for-each-group select="//w:headerReference|//w:footerReference"
	                          group-by="@r:id">
	        <fw>
	          <xsl:attribute name="xml:id">
	            <xsl:value-of select="@r:id"/>
	          </xsl:attribute>
	          <xsl:attribute name="type">
	            <xsl:choose>
	              <xsl:when test="self::w:headerReference">header</xsl:when>
	              <xsl:otherwise>footer</xsl:otherwise>
	            </xsl:choose>
	          </xsl:attribute>

	          <!-- Resolve the relationship id to the target file name via
	               the relationships part of the main document. -->
	          <xsl:variable name="rid" select="@r:id"/>
	          <xsl:variable name="h-file">
	            <xsl:value-of select="document(concat($wordDirectory,'/word/_rels/document.xml.rels'))//rel:Relationship[@Id=$rid]/@Target"/>
	          </xsl:variable>

	          <!-- For the moment the content is simply converted: the w:*
	               children of the file's document element are treated as a
	               single group and processed in inSectionGroup mode. Files
	               that cannot be loaded are skipped. -->
	          <xsl:if test="doc-available(concat($wordDirectory,'/word/', $h-file))">
	            <xsl:for-each-group select="document(concat($wordDirectory,'/word/', $h-file))/*[1]/w:*"
	                                group-adjacent="1">
	              <xsl:apply-templates select="." mode="inSectionGroup"/>
	            </xsl:for-each-group>
	          </xsl:if>
	        </fw>
	      </xsl:for-each-group>
	    </xsl:if>
	  </xsl:template>
460

    
461
   <xsl:template match="w:hyperlink">
      <!-- Converts a Word hyperlink into a TEI ref whose target is looked up
           from the relationship id (r:id) of the link. -->
      <ref>
        <xsl:variable name="rid" select="@r:id"/>

        <!-- Relationship ids are scoped to the package part that holds the
             link, so the lookup must use the relationships file of that
             part. Footnotes are now handled like endnotes; before, a link
             inside a footnote was resolved against the main document's
             relationships. -->
        <xsl:variable name="relsPart">
          <xsl:choose>
            <xsl:when test="ancestor::w:endnote">endnotes.xml.rels</xsl:when>
            <xsl:when test="ancestor::w:footnote">footnotes.xml.rels</xsl:when>
            <xsl:otherwise>document.xml.rels</xsl:otherwise>
          </xsl:choose>
        </xsl:variable>

        <xsl:attribute name="target">
          <xsl:value-of
              select="document(concat($wordDirectory,'/word/_rels/',$relsPart))//rel:Relationship[@Id=$rid]/@Target"/>
        </xsl:attribute>

        <!-- The link text is converted by the ordinary templates. -->
        <xsl:apply-templates/>
      </ref>
   </xsl:template>
479

    
480
   <xsl:template match="w:instrText">
      <!-- Field instruction text is output as-is unless it is one of the
           suppressed kinds: it contains "REF _", or it starts with
           "HYPERLINK", " XE" or "XE". -->
      <xsl:if test="not(contains(.,'REF _')
                        or starts-with(.,'HYPERLINK')
                        or starts-with(.,' XE')
                        or starts-with(.,'XE'))">
         <xsl:value-of select="."/>
      </xsl:if>
   </xsl:template>
491

    
492
    <!-- pass2: rend attributes with the value "Body_Text" or "Normal (Web)"
         are dropped from the output. -->
    <xsl:template match="@rend[.='Body_Text'] | @rend[.='Normal (Web)']"
                  mode="pass2"/>
495

    
496
</xsl:stylesheet>