[ Index ]

PHP Cross Reference of Crawler

title

Body

[close]

/ -> crawl.php (source)

   1  <?php
   2  /**
   3   * 
   4   * Main crawler function.  Can be run manually or via cron.
   5   *
   6   * Append "?debug" to URL for verbose output, but performance will degrade over time (as browser buffer fills)
   7   *
   8   */
   9  
  10  
  11  /**
  12   * Call necesary include files
  13   */
  14  include ('config.php');
  15  include ('includes/functions.php');
  16  include ('includes/mysql_functions.php');
  17  
  18  /**
  19   * Parse domain list into an array
  20   */
  21  $domain_array = explode(',',$domains);
  22  ?>
  23  <html>
  24      <body>
  25      <?php
  26      echo "<p>STARTED: " . date('Y-m-d H:i:s') . "</p>"; 
  27      echo "<p>Crawling...</p>";
  28      
  29      /*
  30       * Grab list of uncrawled URLs, repeat while there are still URLs to crawl
  31       */
  32      while ($urls = uncrawled_urls()) {
  33  
  34          /**
  35           * Loop through the array of uncrawled URLs
  36           */
  37          foreach ($urls as $id=>$url_data) {
  38  
  39              /**
  40               * If we're in debug mode, indicate that we are begining to crawl a new URL
  41               */
  42              if (isset($_GET['debug']))
  43                  echo "<p style='font-weight:bold'>Starting to crawl " . urldecode($url_data['url']) . "</p><ul>";
  44              
  45              /** 
  46               * If this is a seed URL, set clicks to zero, 
  47               * otherwise, increment our internal click counter one beyond the parent's clicks
  48               */
  49              if (!isset($url_data['clicks'])) $clicks = 0;
  50              else $clicks = $url_data['clicks'] + 1;
  51              
  52              /**
  53               * Curl the page, returning data as an array
  54               */
  55              $page_data = curl_page($url_data['url']);
  56              
  57              /**
  58               * Calculate the directory of the current page, used to parse relative URLs
  59               */
  60              $dir = parse_dir($url_data['url']);
  61              
  62              /**
  63               * Parse the title of the current page
  64               */
  65              $title = parse_title($page_data['html']);
  66              
  67              /**
  68               * Parse the HTML for links, store in an array
  69               */
  70              $links = parse_links($page_data['html']);
  71  
  72              /**
  73               * Loop through the array of links
  74               */
  75              foreach ($links as $key => &$link) {
  76                  /**
  77                   * Uniformly clean the link so we don't have duplicates (absolute, no anchors, add www., etc.)
  78                   */
  79                  $link = clean_link($link, $dir);
  80                  
  81                  /**
  82                   * If the link is to an image, do not add it
  83                   */
  84                  if (is_image($link)) continue;
  85                  
  86                  /**
  87                   * Verify that the link target is within our array of domains
  88                   */
  89                  if (out_of_domain($link)) continue;
  90                  
  91                  /**
  92                   * Verify that the link is not a mailto: link
  93                   */ 
  94                  if (is_mailto($link)) continue;
  95                  
  96                  /**
  97                   * CUSTOM LAZY HACK, REMOVE BEFORE OPEN-SOURCING, OTHERWISE THIS WOULD BE EMBARASSING
  98                   */
  99                  if (stripos($link,'/calendar?') != FALSE) continue;
 100                  
 101                  /**
 102                   * Check to see if the URL is already in the table, if so, grab its ID number
 103                   */
 104                  $to = have_url($link);
 105                  
 106                  /**
 107                   * If the link is not in the table, add it
 108                   */
 109                  if (!$to) {
 110                      /**
 111                       * Output that we're adding a URL if we're in verbose mode
 112                       */
 113                      if (isset($_GET['debug']))
 114                          echo "<li>Adding url " . urldecode($link) . " to list</li>";
 115                          
 116                      /**
 117                       * Add URL to table, grab link ID #
 118                       */
 119                      $to = add_url($link,$clicks);
 120                  }
 121                  
 122                  /**
 123                   * If debug mode, indicate that we're adding a link
 124                   */
 125                  if (isset($_GET['debug']))
 126                      echo "<li>Adding link from here to " . urldecode($link) . "</li>";
 127                      
 128                  /**
 129                   * Add the link to the links table
 130                   */
 131                  add_link($id,$to);
 132              }
 133              
 134              /**
 135               * If the server did not report a size (in which case cURL returns '-1'), 
 136               * use the size of the cURL as the file size, otherwise, trust the server
 137               */
 138              if ($page_data['reported_size'] != -1) $size = $page_data['reported_size'];
 139              else $size = $page_data['actual_size'];
 140          
 141              /**
 142               * If the server returned a modifed header, trust it, otherwise (return of '-1' from cURL) NULL the string.
 143               */
 144              if ($page_data['modified'] != -1) $modified = $page_data['modified'];
 145              else $modified = NULL;
 146              
 147              /**
 148               * Format the Data array
 149               */ 
 150              $data = array(    'crawled'=>1,
 151                              'title'=>$title,
 152                              'http_code' => $page_data['http_code'],
 153                              'size' => $size,
 154                              'type' => $page_data['type'],
 155                              'modified' => $modified, 
 156                              );
 157              /**
 158               *  Store data
 159               */
 160              mysql_update('urls',$data,array('ID'=>$id));
 161              
 162              /**
 163               * If in debug mode, close the <ul> we opened above
 164               */
 165              if (isset($_GET['debug']))
 166                  echo "</ul>";
 167                  
 168          } //End foreach URL
 169          
 170      } //End While uncrawled URLs
 171      
 172      /**
 173       * If we're done, let the user know the good news
 174       */
 175      if (sizeof($urls) == 0)    echo "<p>No URLs to crawl!</p>";
 176      echo "<p>FINISHED: " . date('Y-m-d H:i:s') . "</p>"; 
 177      ?>
 178      </body>
 179  <html>


Generated: Thu Jun 3 17:10:09 2010 Cross-referenced by PHPXref 0.7