Révision 2692
tmp/org.txm.cooccurrence.core/src/org/txm/cooccurrence/core/functions/Cooccurrence.java (revision 2692) | ||
---|---|---|
173 | 173 |
/** The writer. */ |
174 | 174 |
private BufferedWriter writer; |
175 | 175 |
|
176 |
|
|
177 |
|
|
178 |
|
|
179 | 176 |
@Parameter(key = CooccurrencePreferences.QUERY_FILTER) |
180 | 177 |
protected String pCooccurentQueryFilter = "[]"; //$NON-NLS-1$ |
181 | 178 |
|
... | ... | |
255 | 252 |
|
256 | 253 |
// clear data |
257 | 254 |
try { |
255 |
this.numberOfCooccurrents = -1; |
|
258 | 256 |
this.lines.clear(); |
259 | 257 |
this.allsignaturesstr.clear(); |
260 | 258 |
this.conclines.clear(); |
... | ... | |
1117 | 1115 |
int startsearchM2 = 0; // optimisation: m2 is ordered |
1118 | 1116 |
int startsearchM3 = 0; // optimisation: m3 is ordered |
1119 | 1117 |
// time = System.currentTimeMillis(); |
1118 |
|
|
1119 |
HashMap<Integer, Integer> positionsDistances = new HashMap<>(); |
|
1120 |
|
|
1120 | 1121 |
for (Match m : m1) { // for each match = for each focus |
1121 | 1122 |
|
1122 | 1123 |
if (m.getTarget() >= 0) { // if target is set focus on target position |
... | ... | |
1165 | 1166 |
// System.out.println("positions"); |
1166 | 1167 |
// System.out.println("start: "+(start)+" end:"+n.getEnd()); |
1167 | 1168 |
for (int position = start; position <= n.getEnd(); position++) { |
1168 |
// creates the list of positions, anticontext not yet removed |
|
1169 |
positions[noOcc++] = position; |
|
1170 |
// System.out.print(" "+position); |
|
1171 |
} |
|
1172 |
|
|
1173 |
|
|
1174 |
|
|
1175 |
// find shortest distance for each positions |
|
1176 |
HashMap<Integer, Integer> positionsDistances = new HashMap<>(); |
|
1177 |
for (int position : positions) { // cooccurrent words positions |
|
1178 | 1169 |
|
1179 | 1170 |
if (o.getStart() <= position && position <= o.getEnd()) { |
1180 | 1171 |
// ignore positions in the anticontext positions |
... | ... | |
1182 | 1173 |
} |
1183 | 1174 |
|
1184 | 1175 |
int dist; |
1185 |
if (position <= m.getStart()) {
|
|
1186 |
dist = m.getStart() - position; |
|
1176 |
if (position < m.getStart()) { |
|
1177 |
dist = m.getStart() - position - 1;
|
|
1187 | 1178 |
} |
1188 |
else if (m.getEnd() <= position) {
|
|
1189 |
dist = position - m.getEnd(); |
|
1179 |
else if (m.getEnd() < position) { |
|
1180 |
dist = position - m.getEnd() - 1;
|
|
1190 | 1181 |
} |
1191 | 1182 |
else { // the n match is in the m match !? |
1192 |
System.out.println("Warning: the n match is in the m match ? " + n + " " + m);
|
|
1183 |
System.out.println("Warning: the n match is in the m match ? " + n + " " + m); |
|
1193 | 1184 |
dist = 0; |
1194 | 1185 |
} |
1195 | 1186 |
if (!positionsDistances.containsKey(position) || positionsDistances.get(position) > dist) { |
... | ... | |
1197 | 1188 |
} |
1198 | 1189 |
} |
1199 | 1190 |
|
1200 |
// store and count distances for each position signature |
|
1201 |
noOcc = 0; |
|
1202 |
for (int position : positionsDistances.keySet()) { // cooccurrent words positions |
|
1203 |
// String signature = allsignatures.get(position); |
|
1204 |
String signaturestr = allsignaturesstr.get(position); |
|
1205 |
|
|
1206 |
if (!distances.containsKey(signaturestr)) { |
|
1207 |
distances.put(signaturestr, 0.0); |
|
1208 |
} |
|
1209 |
|
|
1210 |
if (!counts.containsKey(signaturestr)) { |
|
1211 |
counts.put(signaturestr, 0); |
|
1212 |
} |
|
1213 |
|
|
1214 |
int dist = positionsDistances.get(position); |
|
1191 |
// System.out.println("nb Occ ignored: "+ignore); |
|
1192 |
// System.out.println("nb Occ chevauche: "+chevauche); |
|
1193 |
} |
|
1194 |
|
|
1195 |
// store and count distances for each position signature |
|
1196 |
int noOcc = 0; |
|
1197 |
for (int position : positionsDistances.keySet()) { // cooccurrent words positions |
|
1198 |
// String signature = allsignatures.get(position); |
|
1199 |
String signaturestr = allsignaturesstr.get(position); |
|
1200 |
|
|
1201 |
int dist = positionsDistances.get(position); |
|
1202 |
if (distances.containsKey(signaturestr)) { |
|
1215 | 1203 |
distances.put(signaturestr, (distances.get(signaturestr)) + dist); |
1204 |
} |
|
1205 |
else { |
|
1206 |
distances.put(signaturestr, 0.0); |
|
1207 |
} |
|
1208 |
|
|
1209 |
if (counts.containsKey(signaturestr)) { |
|
1216 | 1210 |
counts.put(signaturestr, (counts.get(signaturestr)) + 1); |
1217 |
// } |
|
1218 |
|
|
1219 |
noOcc++; |
|
1220 | 1211 |
} |
1212 |
else { |
|
1213 |
counts.put(signaturestr, 1); |
|
1214 |
} |
|
1221 | 1215 |
|
1222 |
// System.out.println("nb Occ ignored: "+ignore); |
|
1223 |
// System.out.println("nb Occ chevauche: "+chevauche); |
|
1216 |
if ("[1599]".equals(signaturestr)) { |
|
1217 |
System.out.println("p=" + position + " d=" + dist + " total(d)=" + distances.get(signaturestr) + " c=" + counts.get(signaturestr)); |
|
1218 |
} |
|
1219 |
// } |
|
1220 |
|
|
1221 |
noOcc++; |
|
1224 | 1222 |
} |
1225 | 1223 |
// System.out.println("T counts : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$ |
1226 | 1224 |
|
... | ... | |
1327 | 1325 |
CLine cline = new CLine(this, specifrownames[ii], props, |
1328 | 1326 |
counts.get(signaturestr), // cofreq |
1329 | 1327 |
indexfreqs.get(specifrownames[ii]), scores[ii][1], // freq |
1330 |
((float) (distances.get(signaturestr) / counts.get(signaturestr))) - 1.0f, // mean distance
|
|
1328 |
((float) (distances.get(signaturestr) / counts.get(signaturestr))), // mean distance |
|
1331 | 1329 |
-1); |
1332 | 1330 |
|
1333 | 1331 |
// select the line |
Formats disponibles : Unified diff