Changeset 685224
- Timestamp: 03/21/2013 02:30:23 PM (13 years ago)
- Location: summy/trunk/lib/Summy
- Files: 6 edited
- Core.php (modified) (9 diffs)
- Filter/En/Stemmer.php (modified) (2 diffs)
- Filter/En/Text.php (modified) (2 diffs)
- Filter/Gr/Stemmer.php (modified) (25 diffs)
- Score/Position.php (modified) (4 diffs)
- Score/Term.php (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
-
summy/trunk/lib/Summy/Core.php
r664156 r685224 3 3 /** 4 4 * @package Summy 5 * @version $Id: Core.php 1 05 2013-02-05 00:44:51Z Tefra $5 * @version $Id: Core.php 128 2013-03-17 23:44:07Z Tefra $ 6 6 * @author Christodoulos Tsoulloftas 7 7 * @copyright Copyright 2011-2013, http://www.komposta.net … … 155 155 'language' => 'gr', 156 156 'termScore' => 'tfisf', 157 'positionScore' => ' news',157 'positionScore' => 'article', 158 158 'minWordsLimit' => 6, 159 159 'maxWordsLimit' => 20, … … 187 187 $text = $textFilter->clear($text); 188 188 $text = $textFilter->process($text); 189 $terms = array_ unique(explode(" ", $text));189 $terms = array_filter(explode(" ", $text)); 190 190 foreach($terms AS $term) 191 191 { … … 193 193 { 194 194 $term = $filter->filter($term); 195 if($term === false) 196 { 197 break; 198 } 195 199 } 196 200 … … 238 242 239 243 $i = 0; 240 $total = ceil(($this->config['rate'] / 100) * $this->totalSentences);244 $total = ceil(($this->config['rate'] / 100) * $this->totalSentences); 241 245 $indexes = array_keys($this->sentenceScores); 242 246 //Grab the top x sentences … … 258 262 259 263 /** 260 * Process sente ces, words and produce the core statistics264 * Process sentences, words and produce the core statistics 261 265 * - Loop through sentences 262 266 * - Extra text filtering … … 355 359 356 360 /** 357 * Fetchthe summary body361 * Returns the summary body 358 362 * @return string 359 363 */ … … 365 369 366 370 /** 367 * Fetchthe original text with the summary sentences highlighted371 * Returns the original text with the summary sentences highlighted 368 372 * @return string 369 373 */ … … 386 390 /** 387 391 * 388 * @var type 392 * @var type 389 393 */ 390 394 static $instances = array(); 391 395 392 396 /** 393 * 397 * 394 398 * @return type 395 399 */ -
summy/trunk/lib/Summy/Filter/En/Stemmer.php
r664156 r685224 7 7 * 8 8 * @package Summy 9 * @version $Id: Stemmer.php 88 2013-02-04 13:25:53Z Tefra $9 * @version $Id: Stemmer.php 128 2013-03-17 23:44:07Z Tefra $ 10 10 * @author Christodoulos Tsoulloftas 11 11 * @copyright Copyright 2011-2013, http://www.komposta.net … … 145 145 break; 146 146 147 case ' S':147 case 'T': 148 148 self::replace($word, 'BILITI', 'BLE', 0) OR self::replace($word, 'ALITI', 'AL', 0) OR self::replace($word, 'IVITI', 'IVE', 0); 149 149 break; -
summy/trunk/lib/Summy/Filter/En/Text.php
r664156 r685224 3 3 /** 4 4 * @package Summy 5 * @version $Id: Text.php 1 03 2013-02-04 21:56:04Z Tefra $5 * @version $Id: Text.php 128 2013-03-17 23:44:07Z Tefra $ 6 6 * @author Christodoulos Tsoulloftas 7 7 * @copyright Copyright 2011-2013, http://www.komposta.net … … 29 29 return $string; 30 30 } 31 31 32 32 // Make Paragraphs from html 33 33 $string = str_replace('</p>', "\n", $string); -
summy/trunk/lib/Summy/Filter/Gr/Stemmer.php
r664156 r685224 19 19 * 20 20 * @package Summy 21 * @version $Id: Stemmer.php 88 2013-02-04 13:25:53Z Tefra $21 * @version $Id: Stemmer.php 112 2013-03-06 21:56:52Z Tefra $ 22 22 * @author Christodoulos Tsoulloftas 23 23 * @copyright Copyright 2011-2013, http://www.komposta.net … … 119 119 120 120 //Step1 121 if(preg_match($this->step1regexp, $w)) 122 { 123 preg_match($this->step1regexp, $w, $fp); 121 if(preg_match($this->step1regexp, $w, $fp)) 122 { 124 123 $stem = $fp[1]; 125 124 $suffix = $fp[2]; … … 130 129 // Step 2a 131 130 $re = '/^(.+?)(ΑΔΕΣ|ΑΔΩΝ)$/u'; 132 if(preg_match($re, $w)) 133 { 134 preg_match($re, $w, $fp); 131 if(preg_match($re, $w, $fp)) 132 { 135 133 $stem = $fp[1]; 136 134 $w = $stem; … … 144 142 //Step 2b 145 143 $re = '/^(.+?)(ΕΔΕΣ|ΕΔΩΝ)$/u'; 146 if(preg_match($re, $w)) 147 { 148 preg_match($re, $w, $fp); 144 if(preg_match($re, $w, $fp)) 145 { 149 146 $stem = $fp[1]; 150 147 $w = $stem; … … 158 155 //Step 2c 159 156 $re = '/^(.+?)(ΟΥΔΕΣ|ΟΥΔΩΝ)$/u'; 160 if(preg_match($re, $w)) 161 { 162 preg_match($re, $w, $fp); 157 if(preg_match($re, $w, $fp)) 158 { 163 159 $stem = $fp[1]; 164 160 $w = $stem; … … 173 169 //Step 2d 174 170 $re = '/^(.+?)(ΕΩΣ|ΕΩΝ)$/u'; 175 if(preg_match($re, $w)) 176 { 177 preg_match($re, $w, $fp); 171 if(preg_match($re, $w, $fp)) 172 { 178 173 $stem = $fp[1]; 179 174 $w = $stem; … … 189 184 //Step 3 190 185 $re = '/^(.+?)(ΙΑ|ΙΟΥ|ΙΩΝ)$/u'; 191 if(preg_match($re, $w)) 192 { 193 preg_match($re, $w, $fp); 186 if(preg_match($re, $w, $fp)) 187 { 194 188 $stem = $fp[1]; 195 189 $w = $stem; … … 205 199 //Step 4 206 200 $re = '/^(.+?)(ΙΚΑ|ΙΚΟ|ΙΚΟΥ|ΙΚΩΝ)$/u'; 207 if(preg_match($re, $w)) 208 { 209 preg_match($re, $w, $fp); 201 if(preg_match($re, $w, $fp)) 202 { 210 203 $stem = $fp[1]; 211 204 $w = $stem; … … 228 221 } 229 222 230 if(preg_match($re2, $w)) 231 { 232 preg_match($re2, $w, $fp); 233 $stem = $fp[1]; 234 $w = $stem; 235 $test1 = false; 236 } 237 238 if(preg_match($re, $w)) 239 { 240 preg_match($re, $w, $fp); 223 
if(preg_match($re2, $w, $fp)) 224 { 225 $stem = $fp[1]; 226 $w = $stem; 227 $test1 = false; 228 } 229 230 if(preg_match($re, $w, $fp)) 231 { 241 232 $stem = $fp[1]; 242 233 $w = $stem; … … 254 245 $re3 = '/^(.+?)(ΑΓΑΝΕ|ΗΣΑΝΕ|ΟΥΣΑΝΕ|ΙΟΝΤΑΝΕ|ΙΟΤΑΝΕ|ΙΟΥΝΤΑΝΕ|ΟΝΤΑΝΕ|ΟΤΑΝΕ|ΟΥΝΤΑΝΕ|ΗΚΑΝΕ|ΗΘΗΚΑΝΕ)$/u'; 255 246 256 if(preg_match($re3, $w)) 257 { 258 preg_match($re3, $w, $fp); 247 if(preg_match($re3, $w, $fp)) 248 { 259 249 $stem = $fp[1]; 260 250 $w = $stem; … … 268 258 } 269 259 270 if(preg_match($re2, $w)) 271 { 272 preg_match($re2, $w, $fp); 260 if(preg_match($re2, $w, $fp)) 261 { 273 262 $stem = $fp[1]; 274 263 $w = $stem; … … 287 276 $re4 = '/^(.+?)(ΗΣΕΤΕ)$/u'; 288 277 289 if(preg_match($re4, $w)) 290 { 291 preg_match($re4, $w, $fp); 292 $stem = $fp[1]; 293 $w = $stem; 294 $test1 = false; 295 } 296 297 if(preg_match($re3, $w)) 298 { 299 preg_match($re3, $w, $fp); 278 if(preg_match($re4, $w, $fp)) 279 { 280 $stem = $fp[1]; 281 $w = $stem; 282 $test1 = false; 283 } 284 285 if(preg_match($re3, $w, $fp)) 286 { 300 287 $stem = $fp[1]; 301 288 $w = $stem; … … 316 303 //Step 5d 317 304 $re = '/^(.+?)(ΟΝΤΑΣ|ΩΝΤΑΣ)$/u'; 318 if(preg_match($re, $w)) 319 { 320 preg_match($re, $w, $fp); 305 if(preg_match($re, $w, $fp)) 306 { 321 307 $stem = $fp[1]; 322 308 $w = $stem; … … 337 323 //Step 5e 338 324 $re = '/^(.+?)(ΟΜΑΣΤΕ|ΙΟΜΑΣΤΕ)$/u'; 339 if(preg_match($re, $w)) 340 { 341 preg_match($re, $w, $fp); 325 if(preg_match($re, $w, $fp)) 326 { 342 327 $stem = $fp[1]; 343 328 $w = $stem; … … 355 340 $re2 = '/^(.+?)(ΙΕΣΤΕ)$/u'; 356 341 357 if(preg_match($re2, $w)) 358 { 359 preg_match($re2, $w, $fp); 342 if(preg_match($re2, $w, $fp)) 343 { 360 344 $stem = $fp[1]; 361 345 $w = $stem; … … 369 353 } 370 354 371 if(preg_match($re, $w)) 372 { 373 preg_match($re, $w, $fp); 355 if(preg_match($re, $w, $fp)) 356 { 374 357 $stem = $fp[1]; 375 358 $w = $stem; … … 387 370 $re2 = '/^(.+?)(ΗΘΗΚΑ|ΗΘΗΚΕΣ|ΗΘΗΚΕ)$/u'; 388 371 389 if(preg_match($re2, $w)) 390 { 391 preg_match($re2, $w, $fp); 392 $stem = $fp[1]; 
393 $w = $stem; 394 $test1 = false; 395 } 396 397 if(preg_match($re, $w)) 398 { 399 preg_match($re, $w, $fp); 372 if(preg_match($re2, $w, $fp)) 373 { 374 $stem = $fp[1]; 375 $w = $stem; 376 $test1 = false; 377 } 378 379 if(preg_match($re, $w, $fp)) 380 { 400 381 $stem = $fp[1]; 401 382 $w = $stem; … … 412 393 //Step 5h 413 394 $re = '/^(.+?)(ΟΥΣΑ|ΟΥΣΕΣ|ΟΥΣΕ)$/u'; 414 if(preg_match($re, $w)) 415 { 416 preg_match($re, $w, $fp); 395 if(preg_match($re, $w, $fp)) 396 { 417 397 $stem = $fp[1]; 418 398 $w = $stem; … … 430 410 $re = '/^(.+?)(ΑΓΑ|ΑΓΕΣ|ΑΓΕ)$/u'; 431 411 432 if(preg_match($re, $w)) 433 { 434 preg_match($re, $w, $fp); 412 if(preg_match($re, $w, $fp)) 413 { 435 414 $stem = $fp[1]; 436 415 $w = $stem; … … 450 429 //Step 5j 451 430 $re = '/^(.+?)(ΗΣΕ|ΗΣΟΥ|ΗΣΑ)$/u'; 452 if(preg_match($re, $w)) 453 { 454 preg_match($re, $w, $fp); 431 if(preg_match($re, $w, $fp)) 432 { 455 433 $stem = $fp[1]; 456 434 $w = $stem; … … 466 444 //Step 5k 467 445 $re = '/^(.+?)(ΗΣΤΕ)$/u'; 468 if(preg_match($re, $w)) 469 { 470 preg_match($re, $w, $fp); 446 if(preg_match($re, $w, $fp)) 447 { 471 448 $stem = $fp[1]; 472 449 $w = $stem; … … 482 459 //Step 5l 483 460 $re = '/^(.+?)(ΟΥΝΕ|ΗΣΟΥΝΕ|ΗΘΟΥΝΕ)$/u'; 484 if(preg_match($re, $w)) 485 { 486 preg_match($re, $w, $fp); 461 if(preg_match($re, $w, $fp)) 462 { 487 463 $stem = $fp[1]; 488 464 $w = $stem; … … 498 474 //Step 5l 499 475 $re = '/^(.+?)(ΟΥΜΕ|ΗΣΟΥΜΕ|ΗΘΟΥΜΕ)$/u'; 500 if(preg_match($re, $w)) 501 { 502 preg_match($re, $w, $fp); 476 if(preg_match($re, $w, $fp)) 477 { 503 478 $stem = $fp[1]; 504 479 $w = $stem; … … 515 490 $re = '/^(.+?)(ΜΑΤΑ|ΜΑΤΩΝ|ΜΑΤΟΣ)$/u'; 516 491 $re2 = 
'/^(.+?)(Α|ΑΓΑΤΕ|ΑΓΑΝ|ΑΕΙ|ΑΜΑΙ|ΑΝ|ΑΣ|ΑΣΑΙ|ΑΤΑΙ|ΑΩ|Ε|ΕΙ|ΕΙΣ|ΕΙΤΕ|ΕΣΑΙ|ΕΣ|ΕΤΑΙ|Ι|ΙΕΜΑΙ|ΙΕΜΑΣΤΕ|ΙΕΤΑΙ|ΙΕΣΑΙ|ΙΕΣΑΣΤΕ|ΙΟΜΑΣΤΑΝ|ΙΟΜΟΥΝ|ΙΟΜΟΥΝΑ|ΙΟΝΤΑΝ|ΙΟΝΤΟΥΣΑΝ|ΙΟΣΑΣΤΑΝ|ΙΟΣΑΣΤΕ|ΙΟΣΟΥΝ|ΙΟΣΟΥΝΑ|ΙΟΤΑΝ|ΙΟΥΜΑ|ΙΟΥΜΑΣΤΕ|ΙΟΥΝΤΑΙ|ΙΟΥΝΤΑΝ|Η|ΗΔΕΣ|ΗΔΩΝ|ΗΘΕΙ|ΗΘΕΙΣ|ΗΘΕΙΤΕ|ΗΘΗΚΑΤΕ|ΗΘΗΚΑΝ|ΗΘΟΥΝ|ΗΘΩ|ΗΚΑΤΕ|ΗΚΑΝ|ΗΣ|ΗΣΑΝ|ΗΣΑΤΕ|ΗΣΕΙ|ΗΣΕΣ|ΗΣΟΥΝ|ΗΣΩ|Ο|ΟΙ|ΟΜΑΙ|ΟΜΑΣΤΑΝ|ΟΜΟΥΝ|ΟΜΟΥΝΑ|ΟΝΤΑΙ|ΟΝΤΑΝ|ΟΝΤΟΥΣΑΝ|ΟΣ|ΟΣΑΣΤΑΝ|ΟΣΑΣΤΕ|ΟΣΟΥΝ|ΟΣΟΥΝΑ|ΟΤΑΝ|ΟΥ|ΟΥΜΑΙ|ΟΥΜΑΣΤΕ|ΟΥΝ|ΟΥΝΤΑΙ|ΟΥΝΤΑΝ|ΟΥΣ|ΟΥΣΑΝ|ΟΥΣΑΤΕ|Υ|ΥΣ|Ω|ΩΝ)$/u'; 517 if(preg_match($re, $w)) 518 { 519 preg_match($re, $w, $fp); 492 if(preg_match($re, $w, $fp)) 493 { 520 494 $stem = $fp[1]; 521 495 $w = $stem . "ΜΑ"; 522 496 } 523 497 524 if(preg_match($re2, $w) && $test1) 525 { 526 preg_match($re2, $w, $fp); 498 if(preg_match($re2, $w, $fp) && $test1) 499 { 527 500 $stem = $fp[1]; 528 501 $w = $stem; … … 531 504 // Step 7 (ΠΑΡΑΘΕΤΙΚΑ) 532 505 $re = '/^(.+?)(ΕΣΤΕΡ|ΕΣΤΑΤ|ΟΤΕΡ|ΟΤΑΤ|ΥΤΕΡ|ΥΤΑΤ|ΩΤΕΡ|ΩΤΑΤ)$/u'; 533 if(preg_match($re, $w)) 534 { 535 preg_match($re, $w, $fp); 506 if(preg_match($re, $w, $fp)) 507 { 536 508 $stem = $fp[1]; 537 509 $w = $stem; -
summy/trunk/lib/Summy/Score/Position.php
r664156 r685224 3 3 /** 4 4 * @package Summy 5 * @version $Id: Position.php 93 2013-02-04 18:26:25Z Tefra $5 * @version $Id: Position.php 117 2013-03-16 16:37:14Z Tefra $ 6 6 * @author Christodoulos Tsoulloftas 7 7 * @copyright Copyright 2011-2013, http://www.komposta.net … … 30 30 31 31 /** 32 * Return the Position Weight for a sentence based on the Baxendale investigation32 * Returns the Position Weight for a sentence based on the Baxendale investigation 33 33 * 34 34 * Baxendale (1958) investigated a sample of 200 paragraphs to determine … … 58 58 59 59 /** 60 * Return the Position Weight for a sentence based on the hypothesis that61 * first paragraphs/sentences are the most meaningful lto a document, which60 * Returns the Position Weight for a sentence based on the hypothesis that 61 * first paragraphs/sentences are the most meaningful to a document, which 62 62 * applies to small articles, like newspaper news. 63 63 * … … 68 68 * @return float 69 69 */ 70 public function news($totalParagraphs, $sentencesInParagraph, $paragraph, $sentence)70 public function article($totalParagraphs, $sentencesInParagraph, $paragraph, $sentence) 71 71 { 72 72 return (($totalParagraphs - $paragraph + 1) / $totalParagraphs) * (($sentencesInParagraph - $sentence + 1) / $sentencesInParagraph); -
summy/trunk/lib/Summy/Score/Term.php
r664156 r685224 3 3 /** 4 4 * @package Summy 5 * @version $Id: Term.php 93 2013-02-04 18:26:25Z Tefra $5 * @version $Id: Term.php 125 2013-03-16 17:49:26Z Tefra $ 6 6 * @author Christodoulos Tsoulloftas 7 7 * @copyright Copyright 2011-2013, http://www.komposta.net … … 16 16 /** 17 17 * Language identifier (gr,en) 18 * @var string 18 * @var string 19 19 */ 20 20 private $_language = null; … … 22 22 /** 23 23 * Database adapter 24 * @var object 24 * @var object 25 25 */ 26 26 private $_dbAdapter = null; … … 122 122 123 123 /** 124 * Rank sentences by TF-I SF http://en.wikipedia.org/wiki/Tf-idf124 * Rank sentences by TF-IDF http://en.wikipedia.org/wiki/Tf-idf 125 125 * 126 126 * @param integer $totalSentences … … 138 138 $sql = $db->query("SELECT COUNT(*) as total FROM document WHERE language = ?", array($language))->current(); 139 139 $totalDocuments = $sql['total'] + 1; 140 $terms = $db->platform->quoteValueList(array_keys($tf)); 141 $sql = $db->query("SELECT id, documents FROM term WHERE id IN ({$terms}) AND language = ?", array($language)); 142 140 $terms = implode(',', array_map(array($db->driver->getConnection()->getResource(), 'quote'), array_keys($tf))); 141 $sql = $db->query("SELECT term, documents FROM term WHERE term IN ({$terms}) AND language = ?", array($language)); 143 142 foreach($sql AS $row) 144 143 { 145 $docFreq[$row[' id']] = $row['documents'];144 $docFreq[$row['term']] = $row['documents']; 146 145 } 147 146 … … 171 170 public function tfridf($totalSentences, $tf = array(), $sf = array()) 172 171 { 173 $wordScore = $docFreq = $totFreq = array();172 $wordScore = $docFreq = $totFreq = array(); 174 173 $totalWords = array_sum($tf); 175 174 $db = $this->getAdapter(); … … 177 176 $sql = $db->query("SELECT COUNT(*) as total FROM document WHERE language = ?", array($language))->current(); 178 177 $totalDocuments = $sql['total'] + 1; 179 $terms = $db->platform->quoteValueList(array_keys($tf));180 $sql = $db->query("SELECT id, frequency, documents FROM term 
WHERE idIN ({$terms}) AND language = ?", array($language));178 $terms = implode(',', array_map(array($db->driver->getConnection()->getResource(), 'quote'), array_keys($tf))); 179 $sql = $db->query("SELECT term, frequency, documents FROM term WHERE term IN ({$terms}) AND language = ?", array($language)); 181 180 182 181 foreach($sql AS $row) 183 182 { 184 $docFreq[$row[' id']] = $row['documents'];185 $totFreq[$row[' id']] = $row['frequency'];183 $docFreq[$row['term']] = $row['documents']; 184 $totFreq[$row['term']] = $row['frequency']; 186 185 } 187 186 188 187 foreach($tf AS $word => $frequency) 189 188 { 190 $docFreq[$word] +=1;191 $totFreq[$word] +=$frequency;189 $docFreq[$word] = isset($docFreq[$word]) ? $docFreq[$word] + 1 : 1; 190 $totFreq[$word] = isset($totFreq[$word]) ? $totFreq[$word] + $frequency : $frequency; 192 191 // Normalized frequency 193 192 $_tf = $frequency / $totalWords;
Note: See TracChangeset for help on using the changeset viewer.