Back to index

php5  5.3.10
urlgrab5.php
Go to the documentation of this file.
00001 <?php
00002     /*
00003      * urlgrab5.php
00004      *
00005      * A simple command-line utility to extract all of the URLs contained
00006      * within <A HREF> tags from a document.
00007      *
00008      * NOTE: Only works with tidy for PHP 5; please see urlgrab.php for tidy for PHP 4.3.x
00009      *
00010      * By: John Coggeshall <john@php.net>
00011      *
00012      * Usage: php urlgrab5.php <file>
00013      *
00014      */
/**
 * Recursively collects the HREF attribute of every <A> element found at or
 * below the given node.
 *
 * The node itself is examined first and its children afterwards, so URLs are
 * appended in document order.
 *
 * @param tidyNode   $node Node at which the walk starts.
 * @param array|null $urls Accumulator, passed by reference. Any non-array
 *                         value is replaced by an empty array before
 *                         collection begins; an existing array is appended to.
 *
 * @return array The accumulated URLs (same contents as $urls after the call).
 */
function dump_nodes(tidyNode $node, &$urls = NULL) {

    if (!is_array($urls)) {
        $urls = array();
    }

    // Anchors without an HREF (e.g. <a name="top">) have nothing to report.
    // Guarding with isset() avoids an undefined-index / null-offset notice
    // and keeps NULL entries out of the result.
    if (isset($node->id)
        && $node->id === TIDY_TAG_A
        && isset($node->attribute['href'])) {
        $urls[] = $node->attribute['href'];
    }

    if ($node->hasChildren()) {
        foreach ($node->child as $child) {
            // $urls is taken by reference, so the recursive call appends
            // directly to this accumulator.
            dump_nodes($child, $urls);
        }
    }

    return $urls;
}
00035 
// The input file is a required argument; without this check a missing
// argument would surface as an undefined-offset notice followed by a
// confusing parse failure.
if (!isset($_SERVER['argv'][1])) {
    fwrite(STDERR, "Usage: php urlgrab5.php <file>\n");
    exit(1);
}

// tidy_parse_file() returns false when the file cannot be loaded, in which
// case calling cleanRepair() on the result would be a fatal error.
$a = tidy_parse_file($_SERVER['argv'][1]);
if ($a === false) {
    fwrite(STDERR, "Unable to parse file: " . $_SERVER['argv'][1] . "\n");
    exit(1);
}

// Repair the markup, then print every HREF found under the <html> node.
$a->cleanRepair();
print_r(dump_nodes($a->html()));
00039 ?>