Back to examples |
Simple example using fread_url and a regex to extract all URLs from a webpage (or a file). For fread_url to work, your server must have the relevant options enabled in its php.ini. If your server has cURL installed and enabled, that is the better option and is often quicker; if not, you can replace the function body with an fopen / fgets / fclose loop. cURL also works better because a number of sites expect a referrer and proper client information.
This code not only returns URLS in links, but also links to images and style sheets etc.
The example at the bottom of the page will extract all HREFs from a given webpage ( URL must include http:// ). The URLS are then made relative. |
<?php
// Fetch the page and list the href value of every anchor the pattern matches.
// NOTE(review): $url is not assigned anywhere in the visible code; presumably
// the including page sets it before this point - confirm.
$var = fread_url($url);

// Capture group 1 is the href value, group 2 the text between the tags.
// preg_match_all() already receives $matches by reference; writing &$matches
// at the call site (call-time pass-by-reference) is a fatal error since PHP 5.4.
preg_match_all(
    "/a[\s]+[^>]*?href[\s]?=[\s\"\']+" . "(.*?)[\"\']+.*?>" . "([^<]+|.*?)?<\/a>/",
    $var,
    $matches
);

// Keep only the href values.
$matches = $matches[1];
$list = array();

foreach ($matches as $var) {
    // The values come from a remote page, so escape them before writing
    // them into our own HTML output.
    print(htmlspecialchars($var, ENT_QUOTES, 'UTF-8') . "<br>");
}
/**
 * Fetch the complete contents of a page (or file) as a single string.
 *
 * When the cURL extension is available the request is made with cURL,
 * sending a browser-like user agent and the supplied referrer, following
 * redirects and writing cookies to 'cookie.txt'. When cURL is not installed
 * the function falls back to an fopen() / fgets() / fclose() loop.
 *
 * @param string $url Address (or path) to read.
 * @param string $ref Referrer to send with the request; only used by the
 *                    cURL branch.
 *
 * @return string|false The fetched contents. The cURL branch returns false
 *                      when the transfer fails; the fopen() branch returns
 *                      an empty string when $url cannot be opened.
 */
function fread_url($url, $ref = "")
{
    // Initialised up front so the fopen() branch can append to it and so a
    // failed open returns "" rather than an undefined variable.
    $html = "";

    if (function_exists("curl_init")) {
        $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; " . "Windows NT 5.0)";

        // One handle only: the original created two and orphaned the first.
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_REFERER, $ref);
        curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');

        $html = curl_exec($ch);
        curl_close($ch);
    } else {
        $hfile = fopen($url, "r");
        if ($hfile) {
            // Stop on the first failed read as well as on EOF, so a read
            // error cannot turn this into an endless loop.
            while (($chunk = fgets($hfile, 1024)) !== false) {
                $html .= $chunk;
            }
            fclose($hfile);
        }
    }

    return $html;
}
?>
|
|
|
|