| [ Index ] |
PHP Cross Reference of Crawler |
[Summary view] [Print] [Text view]
<?php
/**
 *
 * Main crawler functions
 *
 * @package crawler
 *
 */

/**
 * Curl and Parsing Functions
 */

/**
 * cURL function which fetches a URL and returns the HTML and page info as an array
 *
 * Follows redirects, stores cookies in 'cookie.txt' (relative to the working
 * directory) and gives up after 10 seconds.
 *
 * @param string $url URL to cURL
 * @return array Associative array with the keys 'html' (body, or false when
 *               curl_exec() fails), 'http_code', 'reported_size', 'actual_size',
 *               'type' and 'modified'
 */
function curl_page($url) {
    $ch = curl_init($url);
    $options = array(
        CURLOPT_HEADER => false,
        CURLOPT_COOKIEJAR => 'cookie.txt',
        CURLOPT_COOKIEFILE => 'cookie.txt',
        CURLOPT_USERAGENT => 'Mozilla/5.0 (FCC New Media Web Crawler)',
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FILETIME => true,
        CURLOPT_TIMEOUT => 10
    );
    curl_setopt_array($ch, $options);

    $output = array();
    $output['html'] = curl_exec($ch);
    $output['http_code'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $output['reported_size'] = curl_getinfo($ch, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
    $output['actual_size'] = curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD);
    $output['type'] = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    $output['modified'] = curl_getinfo($ch, CURLINFO_FILETIME);
    curl_close($ch);
    return $output;
}

/**
 * Function to parse page for its title tag
 *
 * The match is non-greedy so that only the first <title> element is returned
 * even when the document contains further </title> tags (e.g. inside inline
 * SVG), and attributes on the tag (<title lang="en">) are tolerated.
 *
 * @param string $data HTML of page
 * @return string|null Trimmed title of page, or null if not found
 */
function parse_title($data) {
    if (preg_match('#<title\b[^>]*>(.*?)</title>#is', $data, $title)) {
        return trim($title[1]);
    }
    return null;
}

/**
 * Function to parse page for links
 *
 * @param string $data HTML of target page
 * @return array Numeric array of links on page (URLs only)
 */
function parse_links($data) {
    // Group 2 is the href value. The U modifier inverts greediness for the whole pattern.
    $regexp = "<a\s[^>]*href=([\"|']??)([^\"' >]*?)\\1[^>]*>(.*)<\/a>";
    if (preg_match_all("/$regexp/siU", $data, $matches)) {
        return $matches[2];
    }
    return array();
}

/**
 * Given a URL calculates the page's directory
 *
 * @param string $url target URL
 * @return string Everything up to and including the last slash, or an empty
 *                string when the URL contains no slash at all
 */
function parse_dir($url) {
    $slash = strrpos($url, '/');
    // Without this guard false + 1 == 1 and the first character would be returned.
    if ($slash === false) {
        return '';
    }
    return substr($url, 0, $slash + 1);
}

/**
 * Link Checking Functions
 */

/**
 * Uniformly cleans a link to avoid duplicates
 *
 * 1. Changes relative links to absolute (/bar to http://www.foo.com/bar)
 * 2. Removes anchor tags (foo.html#bar to foo.html)
 * 3. Adds trailing slash if directory (foo.com/bar to foo.com/bar/)
 * 4. Adds www if there is not a subdomain (foo.com to www.foo.com but not bar.foo.com)
 *
 * @param string $link link to clean
 * @param string $dir directory of parent (linking) page
 * @return string cleaned link
 */
function clean_link($link, $dir) {
    // Make the link absolute, not relative.
    $link = url_to_absolute($dir, $link);

    // Remove anchors. Strict comparison so that a '#' at offset 0 is also handled.
    $anchor = stripos($link, '#');
    if ($anchor !== false) {
        $link = substr($link, 0, $anchor);
    }

    // Append a slash unless the link already ends in one, names a file with an
    // extension, or carries a query string.
    if (!preg_match('#(^http://(.*)/$)|http://(.*)/(.*)\.([A-Za-z0-9]+)|http://(.*)/([^\?\#]*)(\?|\#)([^/]*)#i', $link)) {
        $link .= '/';
    }

    // Add "www." to bare two-label hosts with a three-letter TLD. The pattern is
    // anchored, the dot is escaped and the host label may not contain '/' so that
    // hosts such as "localhost" or dots further down the path are left untouched.
    $link = preg_replace('#^http://([^./]+)\.([a-zA-Z]{3})/#i', 'http://www.$1.$2/', $link);
    return $link;
}


/**
 * Performs a regular expression to see if a given link is an image
 *
 * @param string $link target link
 * @return bool true on image, false on anything else
 */
function is_image($link) {
    return preg_match('%\.(gif|jpe?g|png|bmp)$%i', $link) === 1;
}

/**
 * Checks to see that a given link is within the domain whitelist
 *
 * Reads the whitelist from the global $domain_array.
 *
 * Note to self: this can be rewritten using a single regex command
 *
 * @param string $link target link
 * @return bool true if out of domain, false if on domain whitelist
 */
function out_of_domain($link) {
    global $domain_array;
    if (!is_array($domain_array)) {
        return true;
    }
    foreach ($domain_array as $domain) {
        $domain = trim($domain);
        // A blank entry would match every link (empty needle matches at offset 0).
        if ($domain === '') {
            continue;
        }
        // Strict comparison: a match at offset 0 is still a match.
        if (stripos($link, $domain) !== false) {
            return false;
        }
    }
    return true;
}

/**
 * Checks to see if a given link is in fact a mailto: link
 *
 * @param string $link Link to check
 * @return bool true on mailto:, false on everything else
 */
function is_mailto($link) {
    return stripos($link, 'mailto:') !== false;
}

/*
 * Data storage and retrieval functions
 */

/**
 * Adds a URL to the URLs table upon discovery in a link
 *
 * @param string $link URL to add (stored url-decoded)
 * @param int $clicks number of clicks from initial page
 * @return mixed Whatever the mysql_insert() helper returns
 */
function add_url($link, $clicks) {
    return mysql_insert('urls', array('url' => urldecode($link), 'clicks' => $clicks));
}

/**
 * Adds a link to the links table
 *
 * Self-links and links that are already recorded are rejected.
 *
 * @param int $from ID of linking page
 * @param int $to ID of target page
 * @return int|bool Result of mysql_insert() on insert, false when nothing was inserted
 */
function add_link($from, $to) {
    if ($from == $to) {
        return false;
    }
    if (mysql_exists('links', array('from' => $from, 'to' => $to))) {
        return false;
    }
    return mysql_insert('links', array('from' => $from, 'to' => $to));
}

/**
 * Grab all links on a given page
 *
 * @param int $pageID pageID
 * @param int $click optionally the number of clicks from the homepage to restrict
 *                   results (accepted for compatibility; currently not used)
 * @return array Multidimensional array keyed by target pageID with page data;
 *               empty array when the page has no outgoing links
 */
function get_links($pageID, $click = '') {
    $output = array();
    $links = mysql_array(mysql_select('links', array('from' => $pageID)), FALSE);
    foreach ($links as $link) {
        $output[$link['to']] = get_page($link['to']);
    }
    return $output;
}

/**
 * Shorthand MySQL function to count links in or out of a given page
 *
 * @param int $pageID subject page
 * @param string $direction Direction to retrieve (either "to" or "from"); used as
 *                          the column name of the lookup
 * @return int Number of links
 */
function count_links($pageID, $direction) {
    $result = mysql_select('links', array($direction => $pageID));
    return mysql_num_rows($result);
}

/**
 * Shorthand MySQL function to get a particular page's row
 *
 * @param int $pageID target page
 * @return array Associative array of page data
 */
function get_page($pageID) {
    return mysql_row_array(mysql_select('urls', array('ID' => $pageID)));
}


/**
 * Shorthand MySQL function to get the first 100 uncrawled URLs
 *
 * NOTE(review): calls mysql_query() from the legacy mysql extension directly.
 *
 * @return array Associative array of uncrawled URLs & page data
 */
function uncrawled_urls() {
    return mysql_array(mysql_query("SELECT * FROM `urls` WHERE `crawled` = '0' LIMIT 100"));
}

/**
 * Checks to see if a given URL is already in the urls table
 *
 * @param string $url URL to check (compared url-decoded)
 * @return int|bool ID of the URL's row if it exists, false if not found
 */
function have_url($url) {
    $row = mysql_row_array(mysql_select('urls', array('url' => urldecode($url))));
    // empty() also covers a false/null result, which sizeof() cannot count.
    if (empty($row)) {
        return false;
    }
    return $row['ID'];
}

/* Deprecated (I think)

function count_slashes($url) {
    if (strlen($url)<7) return 0;
    return substr_count($url,'/',7);
}

function get_slashes($url) {
    if (preg_match_all('#/#',$url,$matches,PREG_OFFSET_CAPTURE,7)) return $matches[0];
    else return array();
}
*/

/**
 * Converts a relative URL (/bar) to an absolute URL (http://www.foo.com/bar)
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * @param string $baseUrl Directory of linking page
 * @param string $relativeUrl URL to convert to absolute
 * @return string|bool Absolute URL, or FALSE when either URL cannot be parsed or
 *                     the base URL is not absolute
 */
function url_to_absolute( $baseUrl, $relativeUrl ) {
    // If relative URL has a scheme, clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path
    if ( $r['path'][0] != '/' )
    {
        // The base may have no path at all (http://foo.com); treat that as an empty directory.
        $base = isset( $b['path'] )
            ? mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' )
            : FALSE;
        if ( $base === FALSE ) $base = '';
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}

/**
 * Required function of URL to absolute
 *
 * Removes empty, "." and ".." segments from a path while keeping a leading and a
 * trailing slash when the input had one.
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * @param string $path Path to normalise
 * @return string Normalised path
 */
function url_remove_dot_segments( $path ) {
    // multi-byte character explode
    $inSegs = preg_split( '!/!u', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg == '' || $seg == '.')
            continue;
        if ( $seg == '..' )
            array_pop( $outSegs );
        else
            array_push( $outSegs, $seg );
    }
    $outPath = implode( '/', $outSegs );
    if ( $path !== '' && $path[0] == '/' )
        $outPath = '/' . $outPath;
    // Compare the position of the last multi-byte character against the last '/'.
    // The encoding is the FOURTH argument of mb_strrpos (the third is the offset),
    // and the comparison is strict so that a "not found" FALSE never equals 0.
    if ( $outPath != '/' &&
        (mb_strlen( $path, 'UTF-8' ) - 1) === mb_strrpos( $path, '/', 0, 'UTF-8' ) )
        $outPath .= '/';
    return $outPath;
}

/**
 * Required function of URL to absolute
 *
 * Splits a URL into its components.
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * @param string $url URL to split
 * @param bool $decode Whether to rawurldecode() the components (default TRUE)
 * @return array|bool Associative array holding whichever of 'scheme', 'user',
 *                    'pass', 'host', 'port', 'path', 'query' and 'fragment' are
 *                    present (empty array for an empty URL), or FALSE when the URL
 *                    does not match the grammar
 */
function split_url( $url, $decode=TRUE )
{
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@%';

    // The hyphen is escaped: unescaped, "+-." formed a range that also admitted ','.
    $xscheme       = '([a-zA-Z][a-zA-Z\d+.\-]*)';

    $xuserinfo     = '(([' . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // The hyphen is escaped: "\d-." is rejected by PCRE2 as an invalid range.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                     '?(:' . $xport . ')?)';

    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    $xurl          = '^(' . $xscheme . ':)?' . $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';


    // Split the URL into components. The numeric indexes below depend on the
    // exact number and order of capture groups in the pattern above.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    $parts = array();

    if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);

    if ( !empty($m[7]) ) {
        if ( isset( $m[9] ) )   $parts['user']    = $m[9];
        else                    $parts['user']    = '';
    }
    if ( !empty($m[10]) )       $parts['pass']    = $m[11];

    if ( !empty($m[13]) )       $h=$parts['host'] = $m[13];
    else if ( !empty($m[14]) )  $parts['host']    = $m[14];
    else if ( !empty($m[16]) )  $parts['host']    = $m[16];
    else if ( !empty( $m[5] ) ) $parts['host']    = '';
    if ( !empty($m[17]) )       $parts['port']    = $m[18];

    if ( !empty($m[19]) )       $parts['path']    = $m[19];
    else if ( !empty($m[21]) )  $parts['path']    = $m[21];
    else if ( !empty($m[25]) )  $parts['path']    = $m[25];

    if ( !empty($m[27]) )       $parts['query']   = $m[28];
    if ( !empty($m[29]) )       $parts['fragment']= $m[30];

    if ( !$decode )
        return $parts;
    if ( !empty($parts['user']) )
        $parts['user']     = rawurldecode( $parts['user'] );
    if ( !empty($parts['pass']) )
        $parts['pass']     = rawurldecode( $parts['pass'] );
    if ( !empty($parts['path']) )
        $parts['path']     = rawurldecode( $parts['path'] );
    // Only host names are decoded ($h is set for names, not for IP literals).
    if ( isset($h) )
        $parts['host']     = rawurldecode( $parts['host'] );
    if ( !empty($parts['query']) )
        $parts['query']    = rawurldecode( $parts['query'] );
    if ( !empty($parts['fragment']) )
        $parts['fragment'] = rawurldecode( $parts['fragment'] );
    return $parts;
}

/**
 * Required function of URL to absolute
 *
 * Builds a URL string from the components produced by split_url().
 *
 * Inspired from code available at http://nadeausoftware.com/node/79,
 * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 *
 * @param array $parts Associative array of URL components
 * @param bool $encode Whether to rawurlencode() the components (default TRUE)
 * @return string Assembled URL
 */
function join_url( $parts, $encode=TRUE )
{
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        // Encode the host unless the WHOLE host is an IP literal, bracketed or not.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        if ( !empty( $parts['path'] ) )
            $parts['path']     = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        // NOTE(review): rawurlencode() also escapes '=' and '&' inside the query;
        // add_url() urldecode()s links before storing them.
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];             // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}

/**
 * Returns filesize in human readable terms
 *
 * Inspired by code available at http://stackoverflow.com/questions/1222245/calculating-script-memory-usages-in-php
 * Code distributed under CC-Wiki License (http://creativecommons.org/licenses/by-sa/2.5/)
 *
 * @param int $size filesize in bytes
 * @return string Size rounded to two decimals followed by its unit; '0 Bytes' for
 *                empty, zero or negative sizes
 */
function file_size($size) {
    $filesizename = array(" Bytes", " KB", " MB", " GB", " TB", " PB", " EB", " ZB", " YB");

    // Negative sizes (cURL reports -1 when the length is unknown) have no logarithm.
    if (!$size || $size < 0) {
        return '0 Bytes';
    }

    // Clamp the unit index so that sizes below one byte or beyond the largest unit
    // never index outside the table.
    $i = (int) floor(log($size, 1024));
    $i = max(0, min($i, count($filesizename) - 1));

    return round($size / pow(1024, $i), 2) . $filesizename[$i];
}

?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jun 3 17:10:09 2010 | Cross-referenced by PHPXref 0.7 |