| [ Index ] |
PHP Cross Reference of Crawler |
[Summary view] [Print] [Text view]
<?php
/**
 *
 * Main crawler script. Can be run manually or via cron.
 *
 * Append "?debug" to the URL for verbose output, but performance will degrade
 * over time (as the browser buffer fills).
 *
 */


/**
 * Call necessary include files
 */
include ('config.php');
include ('includes/functions.php');
include ('includes/mysql_functions.php');

/**
 * Parse the comma-separated domain list into an array.
 * NOTE(review): $domains is presumably set by config.php, and $domain_array is
 * not read anywhere in this script -- presumably a helper picks it up; confirm.
 */
$domain_array = explode(',', $domains);

/**
 * Verbose mode is enabled by the mere presence of "debug" in the query string;
 * evaluate that once instead of on every link.
 */
$crawler_debug = isset($_GET['debug']);
?>
<html>
<body>
<?php
echo "<p>STARTED: " . date('Y-m-d H:i:s') . "</p>";
echo "<p>Crawling...</p>";

/*
 * Grab list of uncrawled URLs, repeat while there are still URLs to crawl
 */
while ($urls = uncrawled_urls()) {

    /**
     * Loop through the array of uncrawled URLs (keyed by URL ID)
     */
    foreach ($urls as $id => $url_data) {

        /**
         * If we're in debug mode, indicate that we are beginning to crawl a new URL.
         * The URL is escaped because it ultimately comes from crawled pages.
         */
        if ($crawler_debug) {
            echo "<p style='font-weight:bold'>Starting to crawl "
                . htmlspecialchars(urldecode($url_data['url']), ENT_QUOTES, 'UTF-8')
                . "</p><ul>";
        }

        /**
         * If this is a seed URL (no click count recorded), set clicks to zero,
         * otherwise, increment our internal click counter one beyond the parent's clicks
         */
        if (!isset($url_data['clicks'])) {
            $clicks = 0;
        } else {
            $clicks = $url_data['clicks'] + 1;
        }

        /**
         * Curl the page, returning data as an array
         */
        $page_data = curl_page($url_data['url']);

        /**
         * Calculate the directory of the current page, used to parse relative URLs
         */
        $dir = parse_dir($url_data['url']);

        /**
         * Parse the title of the current page
         */
        $title = parse_title($page_data['html']);

        /**
         * Parse the HTML for links, store in an array
         */
        $links = parse_links($page_data['html']);

        /**
         * Loop through the array of links. Iterating by value (rather than by
         * reference) avoids leaving a dangling reference into $links behind;
         * the cleaned links are never read back out of $links.
         */
        foreach ($links as $raw_link) {
            /**
             * Uniformly clean the link so we don't have duplicates (absolute, no anchors, add www., etc.)
             */
            $link = clean_link($raw_link, $dir);

            /**
             * If the link is to an image, do not add it
             */
            if (is_image($link)) {
                continue;
            }

            /**
             * Verify that the link target is within our array of domains
             */
            if (out_of_domain($link)) {
                continue;
            }

            /**
             * Verify that the link is not a mailto: link
             */
            if (is_mailto($link)) {
                continue;
            }

            /**
             * CUSTOM LAZY HACK, REMOVE BEFORE OPEN-SOURCING, OTHERWISE THIS WOULD BE EMBARRASSING
             *
             * Strict comparison is required: stripos() returns 0 for a match at
             * the very start of the string, which a loose "!= FALSE" treats as no match.
             */
            if (stripos($link, '/calendar?') !== false) {
                continue;
            }

            /**
             * Check to see if the URL is already in the table, if so, grab its ID number
             */
            $to = have_url($link);

            /**
             * If the link is not in the table, add it
             */
            if (!$to) {
                /**
                 * Output that we're adding a URL if we're in verbose mode
                 */
                if ($crawler_debug) {
                    echo "<li>Adding url "
                        . htmlspecialchars(urldecode($link), ENT_QUOTES, 'UTF-8')
                        . " to list</li>";
                }

                /**
                 * Add URL to table, grab link ID #
                 */
                $to = add_url($link, $clicks);
            }

            /**
             * If debug mode, indicate that we're adding a link
             */
            if ($crawler_debug) {
                echo "<li>Adding link from here to "
                    . htmlspecialchars(urldecode($link), ENT_QUOTES, 'UTF-8')
                    . "</li>";
            }

            /**
             * Add the link to the links table
             */
            add_link($id, $to);
        }

        /**
         * If the server did not report a size (in which case cURL returns '-1'),
         * use the size of the cURL as the file size, otherwise, trust the server.
         * The comparison is deliberately loose: the exact type curl_page() hands
         * back for these fields is not visible here.
         */
        if ($page_data['reported_size'] != -1) {
            $size = $page_data['reported_size'];
        } else {
            $size = $page_data['actual_size'];
        }

        /**
         * If the server returned a modified header, trust it, otherwise (return of '-1' from cURL) NULL the string.
         */
        if ($page_data['modified'] != -1) {
            $modified = $page_data['modified'];
        } else {
            $modified = NULL;
        }

        /**
         * Format the Data array
         */
        $data = array(
            'crawled'   => 1,
            'title'     => $title,
            'http_code' => $page_data['http_code'],
            'size'      => $size,
            'type'      => $page_data['type'],
            'modified'  => $modified,
        );

        /**
         * Store data
         */
        mysql_update('urls', $data, array('ID' => $id));

        /**
         * If in debug mode, close the <ul> we opened above
         */
        if ($crawler_debug) {
            echo "</ul>";
        }

    } //End foreach URL

} //End While uncrawled URLs

/**
 * If we're done, let the user know the good news.
 * $urls is always falsy once the loop above exits; the array cast keeps the
 * historical sizeof() results (0 for an empty array or NULL, 1 for FALSE)
 * without the warning/TypeError newer PHP versions raise when counting a
 * non-countable value.
 */
if (count((array) $urls) === 0) {
    echo "<p>No URLs to crawl!</p>";
}
echo "<p>FINISHED: " . date('Y-m-d H:i:s') . "</p>";
?>
</body>
</html>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jun 3 17:10:09 2010 | Cross-referenced by PHPXref 0.7 |