2021-08-10 12:47:06 +02:00
#!/usr/bin/env php
<?php
/*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 3 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
 * \file    dev/tools/spider.php
 * \brief   Script to spider Dolibarr app.
 *
 * To use it:
 * - Disable module "bookmark"
 * - Exclude param optioncss, token, sortfield, sortorder
 */
// URLs already reported by the crawler; followLink() appends to this list through `global`.
$crawledLinks = [];

// followLink() gives up as soon as its $depth argument exceeds this value.
const MAX_DEPTH = 2;
/**
 * Fetch a page, record every link on it that has not been seen yet and crawl those links in turn.
 *
 * Each new link is appended to the global $crawledLinks list and handed to
 * insertIntoDatabase() together with the title and meta data of the page it was
 * found on. Recursion stops once $depth exceeds MAX_DEPTH.
 *
 * @param   string  $url    URL of the page to fetch
 * @param   int     $depth  Current recursion depth (0 for the start page)
 * @return  void
 */
function followLink($url, $depth = 0)
{
    global $crawledLinks;

    if ($depth > MAX_DEPTH) {
        echo " <div style='color:red;'>The Crawler is giving up!</div> ";
        return;
    }

    $context = stream_context_create(array(
        'http' => array(
            'method' => 'GET',
            // The HTTP wrapper reads "user_agent"; a "user-agent" key is silently ignored.
            'user_agent' => 'gfgBot/0.1',
        ),
    ));

    // Best effort: an unreachable page is skipped instead of aborting the whole crawl.
    $html = @file_get_contents($url, false, $context);
    if ($html === false || $html === '') {
        // DOMDocument::loadHTML() rejects an empty document (ValueError on PHP 8).
        return;
    }

    $doc = new DOMDocument();
    // Crawled HTML is rarely valid: collect libxml errors instead of emitting warnings.
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $pageTitle = getDocTitle($doc, $url);
    $metaData = getDocMetaData($doc);

    // Links discovered on this page only; they are crawled after the page has been fully scanned.
    $crawling = array();
    foreach ($doc->getElementsByTagName('a') as $anchor) {
        $link = $anchor->getAttribute('href');
        if (ignoreLink($link)) {
            continue;
        }

        $link = convertLink($url, $link);
        if (!in_array($link, $crawledLinks, true)) {
            $crawledLinks[] = $link;
            $crawling[] = $link;
            insertIntoDatabase($link, $pageTitle, $metaData, $depth);
        }
    }

    foreach ($crawling as $crawlURL) {
        followLink($crawlURL, $depth + 1);
    }
}
/**
 * Turn a link found on a page into a URL that can be fetched.
 *
 * - "//host/path"            : protocol-relative, gets the scheme of $site
 * - "http://..", "https://.." : already absolute, returned unchanged
 * - "www..."                 : treated as a host name, gets the scheme of $site
 * - "/path"                  : resolved against scheme://host[:port] of $site
 * - anything else            : appended to $site as a sub-path
 *
 * This is a heuristic, not a full RFC 3986 resolver: "../" segments and
 * query-only links are not normalised.
 *
 * @param   string  $site   URL of the page on which the link was found
 * @param   string  $path   Raw value of the href attribute
 * @return  string          URL to fetch
 */
function convertLink($site, $path)
{
    $base = parse_url($site);
    // Fall back to plain http when $site carries no usable scheme.
    $scheme = (is_array($base) && isset($base['scheme'])) ? $base['scheme'] : 'http';

    if (strncmp($path, '//', 2) === 0) {
        // The ':' separator is required, otherwise the result is "http//host/...".
        return $scheme . ':' . $path;
    }

    if (strncasecmp($path, 'http://', 7) === 0 || strncasecmp($path, 'https://', 8) === 0) {
        return $path;
    }

    if (strncasecmp($path, 'www.', 4) === 0) {
        // Without a scheme the stream layer would treat the value as a local file name.
        return $scheme . '://' . $path;
    }

    if (strncmp($path, '/', 1) === 0 && is_array($base) && isset($base['host'])) {
        // A root-relative link replaces the whole path of the referring page.
        $origin = $scheme . '://' . $base['host'];
        if (isset($base['port'])) {
            $origin .= ':' . $base['port'];
        }
        return $origin . $path;
    }

    // Plain relative link: keep exactly one slash between the two parts.
    return rtrim($site, '/') . '/' . ltrim($path, '/');
}
/**
 * Tell whether a href value must be skipped by the crawler.
 *
 * Skipped values are: an empty href, an in-page anchor ("#...") and a
 * "javascript:" pseudo URL (matched case-insensitively).
 *
 * @param   string  $url    Raw value of the href attribute
 * @return  bool            true when the link must not be followed
 */
function ignoreLink($url)
{
    $url = trim((string) $url);

    // An <a> without href yields '': nothing to follow, and $url[0] would be out of range.
    if ($url === '') {
        return true;
    }

    return $url[0] === '#' || strncasecmp($url, 'javascript:', 11) === 0;
}
/**
 * Print the record describing a crawled link (nothing is written to a database here).
 *
 * @param   string  $link       URL of the link
 * @param   string  $title      Title printed with the record
 * @param   array   $metaData   Array with the keys 'description' and 'keywords' (passed by reference, only read)
 * @param   int     $depth      Crawl depth (not used in the output)
 * @return  void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
    $description = $metaData['description'];
    $keywords = $metaData['keywords'];

    echo sprintf(
        " Inserting new record { URL= %s , Title = ' %s ', Description = ' %s ', Keywords = ' %s '}<br/><br/><br/> ",
        $link,
        $title,
        $description,
        $keywords
    );
}
/**
 * Return the title of a parsed HTML document, falling back to its URL.
 *
 * Line breaks are removed from the title and surrounding white space is trimmed.
 * When the document has no <title> element, or the title is empty, $url is returned.
 *
 * @param   DOMDocument $doc    Parsed document (passed by reference, not modified)
 * @param   string      $url    URL of the document, used as fallback
 * @return  string              Title or URL
 */
function getDocTitle(&$doc, $url)
{
    $titleNodes = $doc->getElementsByTagName('title');
    if ($titleNodes->length === 0) {
        return $url;
    }

    // The search and replace arguments were swapped before (empty search = no-op),
    // and '\n' in single quotes is a backslash followed by "n", not a line break.
    $title = str_replace(array("\r", "\n"), '', (string) $titleNodes->item(0)->nodeValue);
    $title = trim($title);

    return ($title === '') ? $url : $title;
}
/**
 * Extract the "keywords" and "description" meta tags of a parsed HTML document.
 *
 * Meta names are matched case-insensitively and line breaks are removed from the
 * values. When a tag is missing, keywords default to '' and description to
 * 'No Description Available'. If a name occurs several times, the last one wins.
 *
 * @param   DOMDocument $doc    Parsed document (passed by reference, not modified)
 * @return  array               Array with the keys 'keywords' and 'description'
 */
function getDocMetaData(&$doc)
{
    $metaData = array(
        'keywords' => '',
        'description' => 'No Description Available',
    );

    foreach ($doc->getElementsByTagName('meta') as $node) {
        // Tags without a name (charset, http-equiv, ...) yield '' here and are skipped below.
        $name = strtolower(trim($node->getAttribute('name')));
        if ($name === 'keywords' || $name === 'description') {
            // The search and replace arguments were swapped before, which made the call a no-op.
            $metaData[$name] = str_replace(array("\r", "\n"), '', $node->getAttribute('content'));
        }
    }

    return $metaData;
}
// Start URL: first command-line argument when given, otherwise the local development instance.
$startUrl = isset($argv[1]) ? $argv[1] : 'http://localhost/dolibarr_dev/htdocs';
followLink($startUrl);