root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTMLIndexer.groovy @ 2473
History | View | Annotate | Download (4.6 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate:$
|
25 | 881 | mdecorde | // $LastChangedRevision:$
|
26 | 881 | mdecorde | // $LastChangedBy:$
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import java.util.Collections; |
31 | 881 | mdecorde | import java.io.File; |
32 | 881 | mdecorde | import org.txm.utils.io.FileCopy; |
33 | 881 | mdecorde | import org.txm.utils.*; |
34 | 881 | mdecorde | import java.io.File; |
35 | 881 | mdecorde | import java.util.HashMap; |
36 | 881 | mdecorde | import java.util.ArrayList; |
37 | 881 | mdecorde | import java.io.File; |
38 | 881 | mdecorde | import javax.xml.stream.*; |
39 | 881 | mdecorde | import java.net.URL; |
40 | 881 | mdecorde | |
/**
 * Builds an index of the anchors found in the HTML pages of a directory.
 *
 * Each element whose {@code id} attribute starts with {@link #idxprefix} is
 * recorded as a "fileName#id" reference under the last whitespace-separated
 * word of the text read before it in the same file.
 */
class HTMLIndexer {

	/** Prefix of the {@code id} attribute values that mark an index anchor. */
	static String idxprefix = "IDX-";

	/** Maps a token to the "fileName#id" references collected for it. */
	HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

	/**
	 * Indexes every ".html" file returned by {@code DeleteDir.scanDirectory},
	 * merges the doubles of the resulting index and prints it.
	 *
	 * @param htmlDirectory the directory containing the HTML pages
	 * @return true if the directory exists and every HTML file was parsed
	 *         without error, false otherwise
	 */
	private boolean processHTMLDir(File htmlDirectory)
	{
		if (htmlDirectory == null || !htmlDirectory.isDirectory()) {
			println("Not a directory: "+htmlDirectory);
			return false;
		}

		ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
		Collections.sort(htmlfiles);

		// keep going after a failure so one broken page does not hide the others
		boolean allParsed = true;
		for (File htmlFile : htmlfiles) {
			if (htmlFile.getName().endsWith(".html") && !processHTMLFile(htmlFile)) {
				allParsed = false;
			}
		}

		mergeDoubles();

		// sorted so that the printed report is stable from one run to the next
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
		Collections.sort(tokens);
		for (String token : tokens) {
			println("Token: "+token);
			println(index.get(token));
		}
		return allParsed;
	}

	/**
	 * Fixes doubles like étiquette&étiquettes: when a token is immediately
	 * followed, in sorted order, by the same token plus one character, the
	 * references of the longer token are appended to the shorter one and the
	 * longer token is removed from {@link #index}.
	 *
	 * At most one double is merged into a given token per call.
	 */
	private void mergeDoubles()
	{
		ArrayList<String> tokens = new ArrayList<String>();
		for (String token : index.keySet()) {
			// a null key cannot be sorted and an empty one would swallow every 1-letter token
			if (token != null && token.length() > 0) {
				tokens.add(token);
			}
		}
		Collections.sort(tokens);

		// stop one element early: each step compares a token with its successor
		for (int i = 0 ; i + 1 < tokens.size() ; i++) {
			String t1 = tokens.get(i);
			String t2 = tokens.get(i + 1);
			if (t2.length() == t1.length() + 1 && t2.startsWith(t1)) {
				tokens.remove(i + 1);
				// remove from the index itself, not only from the working list
				index.get(t1).addAll(index.remove(t2));
			}
		}
	}

	/**
	 * Parses one HTML file with a StAX reader and adds its index anchors to
	 * {@link #index}.
	 *
	 * The file must be well-formed XML; DTD support and entity replacement are
	 * switched off on the parser. Anchors met before any non-blank text are
	 * ignored since there is no word to file them under.
	 *
	 * @param htmlFile the html file
	 * @return true if the file was parsed to the end, false if an error occurred
	 *         (the error is printed)
	 */
	private boolean processHTMLFile(File htmlFile)
	{
		String lasttoken = null;
		def inputData = null;
		// declared outside the try block so that it is still in scope when closing
		XMLStreamReader parser = null;

		try {
			URL url = htmlFile.toURI().toURL();
			println "process html file "+url;
			inputData = url.openStream();
			XMLInputFactory factory = XMLInputFactory.newInstance();
			factory.setProperty("javax.xml.stream.supportDTD", false);
			factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

			parser = factory.createXMLStreamReader(inputData);
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					String id = parser.getAttributeValue(null, "id");
					boolean hasToken = lasttoken != null && lasttoken.length() > 0;
					if (id != null && id.startsWith(idxprefix) && hasToken) {
						if (!index.containsKey(lasttoken)) {
							index.put(lasttoken, new ArrayList<String>());
						}
						index.get(lasttoken).add(htmlFile.getName()+"#"+id)
					}
				} else if (event == XMLStreamConstants.CHARACTERS) {
					String text = parser.getText().trim();
					if (text.length() > 0) {
						// any run of whitespace separates words, including line breaks
						def texts = text.split("\\s+");
						lasttoken = texts[texts.size() - 1];
						if (lasttoken.endsWith(".")) {
							lasttoken = lasttoken.substring(0, lasttoken.length() - 1)
						}
					}
				}
			}
			return true;
		} catch (Exception e) {
			println("File "+htmlFile+"\n"+e);
			return false;
		} finally {
			// closing the reader does not close the stream it reads from: release both,
			// and never let a failure to close hide the result of the parsing
			try { if (parser != null) parser.close(); } catch (Exception ignored) { }
			try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
		}
	}

	/**
	 * The main method.
	 *
	 * @param args optional: args[0] is the directory of HTML pages to index;
	 *             defaults to "/home/mdecorde/xml/html" when absent
	 */
	public static void main(String[] args)
	{
		String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/html";
		File htmlDirectory = new File(path)
		new HTMLIndexer().processHTMLDir(htmlDirectory);
	}
}