[ Index ]

PHP Cross Reference of Crawler

title

Body

[close]

/includes/ -> functions.php (source)

   1  <?php
   2  /**
   3   *
   4   * Main crawler functions
   5   *
   6   * @package crawler
   7   *
   8   */
   9  
  10   /**
  11    * Curl and Parsing Functions
  12    */
  13   
  14   /**
  15    * cURL Function which returns HTML and page info as array
  16    *
  17    * @params string $url URL to cURL
  18    * @return array Associative array of results
  19    */
  20  function curl_page($url) {
  21      $ch = curl_init($url);
  22      $options = array(
  23              CURLOPT_HEADER => false,
  24              CURLOPT_COOKIEJAR => 'cookie.txt',
  25              CURLOPT_COOKIEFILE    => 'cookie.txt',
  26              CURLOPT_USERAGENT => 'Mozilla/5.0 (FCC New Media Web Crawler)',
  27              CURLOPT_FOLLOWLOCATION => true,
  28              CURLOPT_RETURNTRANSFER => true,
  29              CURLOPT_FILETIME => true,
  30              CURLOPT_TIMEOUT => 10
  31              );
  32      curl_setopt_array($ch, $options);
  33      $output['html'] = curl_exec($ch);
  34      $output['http_code'] = curl_getinfo($ch,CURLINFO_HTTP_CODE);
  35      $output['reported_size'] = curl_getinfo($ch,CURLINFO_CONTENT_LENGTH_DOWNLOAD);
  36      $output['actual_size'] = curl_getinfo($ch,CURLINFO_SIZE_DOWNLOAD);
  37      $output['type'] = curl_getinfo($ch,CURLINFO_CONTENT_TYPE);
  38      $output['modified'] = curl_getinfo($ch,CURLINFO_FILETIME);    
  39      curl_close($ch);
  40      return $output;
  41  }
  42  
  43  /**
  44   * Function to parse page for title tags
  45   *
  46   * @params string $data HTML of page
  47   * @return string|bool title of page of null if not found
  48   */
  49  function parse_title($data) {
  50      if (preg_match('#<title>(.*)</title>#is',$data,$title)) return trim($title[1]);
  51      else return null;
  52  }
  53  
  54  /**
  55   * Function to parse page for links
  56   *
  57   * @params string $data HTML of target page
  58   * @return array Numeric array of links on page (URLs only)
  59   */
  60  function parse_links($data) {
  61      $regexp = "<a\s[^>]*href=([\"|']??)([^\"' >]*?)\\1[^>]*>(.*)<\/a>";
  62        if(preg_match_all("/$regexp/siU", $data, $matches)) return $matches[2];
  63        else return array();
  64  }
  65  
  66  /**
  67   * Given a URL calculates the page's directory
  68   *
  69   * @params string $url target URL
  70   * @return string Directory
  71   */
  72  function parse_dir($url) {
  73      $slash = strrpos($url,'/');
  74      return substr($url,0,$slash+1);
  75  }
  76  
  77  /**
  78   * Link Checking Functions
  79   */
  80  
  81  /**
  82   * Uniformly cleans a link to avoid duplicates
  83   *
  84   * 1. Changes relative links to absolute (/bar to http://www.foo.com/bar)
  85   * 2. Removes anchor tags (foo.html#bar to foo.html)
  86   * 3. Adds trailing slash if directory (foo.com/bar to foo.com/bar/)
  87   * 4. Adds www if there is not a subdomain (foo.com to www.foo.com but not bar.foo.com)
  88   *
  89   * @params string $link link to clean
  90   * @parmas string $dir directory of parent (linking) page
  91   * @return strin cleaned link
  92   */
  93  function clean_link($link, $dir) {
  94      $link = url_to_absolute($dir, $link); //make them absolute, not relative
  95      if (stripos($link,'#') != FALSE) $link = substr($link,0,stripos($link,'#')); //remove anchors
  96      if (!preg_match('#(^http://(.*)/$)|http://(.*)/(.*)\.([A-Za-z0-9]+)|http://(.*)/([^\?\#]*)(\?|\#)([^/]*)#i',$link))  $link .= '/';
  97      $link = preg_replace('#http://([^.]+).([a-zA-z]{3})/#i','http://www.$1.$2/',$link);
  98      return $link;
  99  }
 100  
 101  
 102  /**
 103   * Performs a regular expressoin to see if a given link is an image
 104   *
 105   * @params string $link target link
 106   * @return bool true on image, false on anything else
 107   */
 108  function is_image($link) {
 109      if (preg_match('%\.(gif|jpe?g|png|bmp)$%i',$link)) return true;
 110      else return false;
 111  }
 112  
 113  /**
 114   * Checks to see that a given link is within the domain whitelist
 115   *
 116   * Note to self: this can be rewritten using a single regex command
 117   *
 118   * @params string $link target link
 119   * @return bool true if out of domain, false if on domain whitelist
 120   */
 121  function out_of_domain($link) {
 122      global $domain_array;
 123      foreach ($domain_array as $domain) {
 124          if (stripos($link,trim($domain)) != FALSE) return false;
 125      }
 126      return true;
 127  }
 128  
 129  /**
 130   * Checks to see if a given link is in fact a mailto: link
 131   *
 132   * @params string $link Link to check
 133   * @return bool true on mailto:, false on everything else
 134   */
 135  function is_mailto($link) {
 136      if (stripos($link,'mailto:')===FALSE) return false;
 137      else return true;
 138  }
 139  
 140  /*
 141   * Data storage and retrieval functions
 142   */
 143  
 144  /**
 145   * Adds a URL to the URLs table upon discovery in a link
 146   *
 147   * @params string $link URL to add
 148   * @params int $clicks number of clicks from initial page
 149   * @return bool true on sucess, false on fail
 150   */
 151  function add_url($link,$clicks) {
 152      return mysql_insert('urls',array('url'=>urldecode($link),'clicks'=>$clicks));
 153  }
 154  
 155  /**
 156   * Adds a link to the links table
 157   *
 158   * @params int $form ID of linking page
 159   * @params int $to ID of target page
 160   * @return int|bool LinkID on sucess, false on fail
 161   */
 162  function add_link($from,$to) {
 163      if ($from == $to) return false;
 164      if (mysql_exists('links',array('from'=>$from,'to'=>$to))) return false;
 165      else return mysql_insert('links',array('from'=>$from,'to'=>$to));
 166  }
 167  
 168  /**
 169   * Grab all links on a given page, optionally for a specific depth
 170   *
 171   * @params int $pageID pageID
 172   * @params int $click optionally the number of clicks from the homepage to restrict results
 173   * @return array Multidimensional array keyed by target pageID with page data
 174   */
 175  function get_links($pageID,$click = '') {
 176      $links = mysql_array(mysql_select('links',array('from'=>$pageID)),FALSE);
 177      foreach ($links as $link) $output[$link['to']] = get_page($link['to']);
 178      return $output;
 179  }
 180  
 181  /**
 182   * Shorthand MySQL function to count links in or out of a given page
 183   *
 184   * @params int $pageID subject page
 185   * @params string $direction Direction to retrieve (either "to" or "from")
 186   * @return int Number of links
 187   */
 188  function count_links($pageID,$direction) {
 189      $result = mysql_select('links',array($direction=>$pageID));
 190      return mysql_num_rows($result);
 191  }
 192  
 193  /**
 194   * Shorthand MySQL function to get a particular page's row
 195   *
 196   * @params int $pageID target page
 197   * @return array Associative array of page data
 198   */
 199  function get_page($pageID) {
 200      return mysql_row_array(mysql_select('urls',array('ID'=>$pageID)));
 201  }
 202  
 203  
 204  /**
 205   * Shorthand MySQL function to to get the first 100 uncrawled URLs 
 206   *
 207   * @return array Associative array of uncrawled URLs & page data
 208   */
 209  function uncrawled_urls() {
 210      return mysql_array(mysql_query("SELECT * FROM `urls` WHERE `crawled` = '0' LIMIT 100"));
 211  }
 212  
 213  /**
 214   * Checks to see if a given URL is already in the pages table
 215   *
 216   * @params string $link URL to check
 217   * @return bool true if URL exists, false if not found
 218   */
 219  function have_url($url) {
 220      $url = mysql_row_array(mysql_select('urls',array('url'=>urldecode($url))));
 221      if (sizeof($url)==0) return false;
 222      else return $url['ID'];
 223  }
 224  
  225  /* Deprecated (I think)
 226  
 227  function count_slashes($url) {
 228      if (strlen($url)<7) return 0;
 229      return substr_count($url,'/',7);
 230  }
 231  
 232  function get_slashes($url) {
 233      if (preg_match_all('#/#',$url,$matches,PREG_OFFSET_CAPTURE,7)) return $matches[0];
 234      else return array();
 235  }
 236  */
 237  
 238  /**
 239   * Converts a relative URL (/bar) to an absolute URL (http://www.foo.com/bar)
 240   *
 241   * Inspired from code available at http://nadeausoftware.com/node/79, 
 242   * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 243   * 
 244   * @params string $baseUrl Directory of linking page
 245   * @params string $relativeURL URL to convert to absolute
 246   * @return string Absolute URL
 247   */
 248  function url_to_absolute( $baseUrl, $relativeUrl ) {
 249      // If relative URL has a scheme, clean path and return.
 250      $r = split_url( $relativeUrl );
 251      if ( $r === FALSE )
 252          return FALSE;
 253      if ( !empty( $r['scheme'] ) )
 254      {
 255          if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
 256              $r['path'] = url_remove_dot_segments( $r['path'] );
 257          return join_url( $r );
 258      }
 259   
 260      // Make sure the base URL is absolute.
 261      $b = split_url( $baseUrl );
 262      if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
 263          return FALSE;
 264      $r['scheme'] = $b['scheme'];
 265   
 266      // If relative URL has an authority, clean path and return.
 267      if ( isset( $r['host'] ) )
 268      {
 269          if ( !empty( $r['path'] ) )
 270              $r['path'] = url_remove_dot_segments( $r['path'] );
 271          return join_url( $r );
 272      }
 273      unset( $r['port'] );
 274      unset( $r['user'] );
 275      unset( $r['pass'] );
 276   
 277      // Copy base authority.
 278      $r['host'] = $b['host'];
 279      if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
 280      if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
 281      if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];
 282   
 283      // If relative URL has no path, use base path
 284      if ( empty( $r['path'] ) )
 285      {
 286          if ( !empty( $b['path'] ) )
 287              $r['path'] = $b['path'];
 288          if ( !isset( $r['query'] ) && isset( $b['query'] ) )
 289              $r['query'] = $b['query'];
 290          return join_url( $r );
 291      }
 292   
 293      // If relative URL path doesn't start with /, merge with base path
 294      if ( $r['path'][0] != '/' )
 295      {
 296          $base = mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' );
 297          if ( $base === FALSE ) $base = '';
 298          $r['path'] = $base . '/' . $r['path'];
 299      }
 300      $r['path'] = url_remove_dot_segments( $r['path'] );
 301      return join_url( $r );
 302  }
 303  
 304  /**
 305   * Required function of URL to absolute
 306   *
 307   * Inspired from code available at http://nadeausoftware.com/node/79, 
 308   * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 309   * 
 310   */
 311  function url_remove_dot_segments( $path ) {
 312      // multi-byte character explode
 313      $inSegs  = preg_split( '!/!u', $path );
 314      $outSegs = array( );
 315      foreach ( $inSegs as $seg )
 316      {
 317          if ( $seg == '' || $seg == '.')
 318              continue;
 319          if ( $seg == '..' )
 320              array_pop( $outSegs );
 321          else
 322              array_push( $outSegs, $seg );
 323      }
 324      $outPath = implode( '/', $outSegs );
 325      if ( $path[0] == '/' )
 326          $outPath = '/' . $outPath;
 327      // compare last multi-byte character against '/'
 328      if ( $outPath != '/' &&
 329          (mb_strlen($path)-1) == mb_strrpos( $path, '/', 'UTF-8' ) )
 330          $outPath .= '/';
 331      return $outPath;
 332  }
 333  
 334  /**
 335   * Required function of URL to absolute
 336   *
 337   * Inspired from code available at http://nadeausoftware.com/node/79, 
 338   * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 339   * 
 340   */
 341  function split_url( $url, $decode=TRUE )
 342  {
 343      $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
 344      $xpchar        = $xunressub . ':@%';
 345  
 346      $xscheme       = '([a-zA-Z][a-zA-Z\d+-.]*)';
 347  
 348      $xuserinfo     = '((['  . $xunressub . '%]*)' .
 349                       '(:([' . $xunressub . ':%]*))?)';
 350  
 351      $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
 352  
 353      $xipv6         = '(\[([a-fA-F\d.:]+)\])';
 354  
 355      $xhost_name    = '([a-zA-Z\d-.%]+)';
 356  
 357      $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
 358      $xport         = '(\d*)';
 359      $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
 360                       '?(:' . $xport . ')?)';
 361  
 362      $xslash_seg    = '(/[' . $xpchar . ']*)';
 363      $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
 364      $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
 365      $xpath_abs     = '(/(' . $xpath_rel . ')?)';
 366      $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
 367                       '|' . $xpath_rel . ')';
 368  
 369      $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';
 370  
 371      $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
 372                       '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';
 373   
 374   
 375      // Split the URL into components.
 376      if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
 377          return FALSE;
 378   
 379      if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);
 380   
 381      if ( !empty($m[7]) ) {
 382          if ( isset( $m[9] ) )   $parts['user']    = $m[9];
 383          else            $parts['user']    = '';
 384      }
 385      if ( !empty($m[10]) )       $parts['pass']    = $m[11];
 386   
 387      if ( !empty($m[13]) )       $h=$parts['host'] = $m[13];
 388      else if ( !empty($m[14]) )  $parts['host']    = $m[14];
 389      else if ( !empty($m[16]) )  $parts['host']    = $m[16];
 390      else if ( !empty( $m[5] ) ) $parts['host']    = '';
 391      if ( !empty($m[17]) )       $parts['port']    = $m[18];
 392   
 393      if ( !empty($m[19]) )       $parts['path']    = $m[19];
 394      else if ( !empty($m[21]) )  $parts['path']    = $m[21];
 395      else if ( !empty($m[25]) )  $parts['path']    = $m[25];
 396   
 397      if ( !empty($m[27]) )       $parts['query']   = $m[28];
 398      if ( !empty($m[29]) )       $parts['fragment']= $m[30];
 399   
 400      if ( !$decode )
 401          return $parts;
 402      if ( !empty($parts['user']) )
 403          $parts['user']     = rawurldecode( $parts['user'] );
 404      if ( !empty($parts['pass']) )
 405          $parts['pass']     = rawurldecode( $parts['pass'] );
 406      if ( !empty($parts['path']) )
 407          $parts['path']     = rawurldecode( $parts['path'] );
 408      if ( isset($h) )
 409          $parts['host']     = rawurldecode( $parts['host'] );
 410      if ( !empty($parts['query']) )
 411          $parts['query']    = rawurldecode( $parts['query'] );
 412      if ( !empty($parts['fragment']) )
 413          $parts['fragment'] = rawurldecode( $parts['fragment'] );
 414      return $parts;
 415  }
 416  
 417  /**
 418   * Required function of URL to absolute
 419   *
 420   * Inspired from code available at http://nadeausoftware.com/node/79, 
 421   * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php)
 422   * 
 423   */
 424  function join_url( $parts, $encode=TRUE )
 425  {
 426      if ( $encode )
 427      {
 428          if ( isset( $parts['user'] ) )
 429              $parts['user']     = rawurlencode( $parts['user'] );
 430          if ( isset( $parts['pass'] ) )
 431              $parts['pass']     = rawurlencode( $parts['pass'] );
 432          if ( isset( $parts['host'] ) &&
 433              !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
 434              $parts['host']     = rawurlencode( $parts['host'] );
 435          if ( !empty( $parts['path'] ) )
 436              $parts['path']     = preg_replace( '!%2F!ui', '/',
 437                  rawurlencode( $parts['path'] ) );
 438          if ( isset( $parts['query'] ) )
 439              $parts['query']    = rawurlencode( $parts['query'] );
 440          if ( isset( $parts['fragment'] ) )
 441              $parts['fragment'] = rawurlencode( $parts['fragment'] );
 442      }
 443   
 444      $url = '';
 445      if ( !empty( $parts['scheme'] ) )
 446          $url .= $parts['scheme'] . ':';
 447      if ( isset( $parts['host'] ) )
 448      {
 449          $url .= '//';
 450          if ( isset( $parts['user'] ) )
 451          {
 452              $url .= $parts['user'];
 453              if ( isset( $parts['pass'] ) )
 454                  $url .= ':' . $parts['pass'];
 455              $url .= '@';
 456          }
 457          if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
 458              $url .= '[' . $parts['host'] . ']'; // IPv6
 459          else
 460              $url .= $parts['host'];             // IPv4 or name
 461          if ( isset( $parts['port'] ) )
 462              $url .= ':' . $parts['port'];
 463          if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
 464              $url .= '/';
 465      }
 466      if ( !empty( $parts['path'] ) )
 467          $url .= $parts['path'];
 468      if ( isset( $parts['query'] ) )
 469          $url .= '?' . $parts['query'];
 470      if ( isset( $parts['fragment'] ) )
 471          $url .= '#' . $parts['fragment'];
 472      return $url;
 473  }
 474  
 475  /**
 476   * Returns filesize in human readable terms
 477   *
 478   * Inspired by code available at http://stackoverflow.com/questions/1222245/calculating-script-memory-usages-in-php
 479   * Code distributed under CC-Wiki License (http://creativecommons.org/licenses/by-sa/2.5/) 
 480   *
 481   * @params int $size filesize in bytes
 482   */
 483      function file_size($size)  {
 484          $filesizename = array(" Bytes", " KB", " MB", " GB", " TB", " PB", " EB", " ZB", " YB");
 485          return $size ? round($size/pow(1024, ($i = floor(log($size, 1024)))), 2) . $filesizename[$i] : '0 Bytes';
 486      }
 487  
 488  ?>


Generated: Thu Jun 3 17:10:09 2010 Cross-referenced by PHPXref 0.7