dolibarr/dev/tools/spider.php

157 lines
3.9 KiB
PHP
Raw Normal View History

2021-08-10 12:47:06 +02:00
#!/usr/bin/env php
<?php
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
 * \file    dev/tools/spider.php
 * \brief   Script to spider the Dolibarr application.
 *
 * To use it:
 * - Disable the module "bookmark"
 * - Exclude the URL parameters optioncss, token, sortfield and sortorder
 */
// List of absolute URLs already discovered by followLink(); shared through "global".
$crawledLinks = array();

// Deepest recursion level followLink() will process before giving up.
const MAX_DEPTH = 2;
/**
 * Crawl one page, report every link not seen before, then recurse into those links.
 *
 * The page at $url is downloaded and parsed as HTML. Each <a href> that is not
 * rejected by ignoreLink() is made absolute with convertLink(); if the result is
 * not yet in the global $crawledLinks list it is appended to it and reported via
 * insertIntoDatabase() together with the title and meta data of the page that
 * contains the link. Recursion stops once $depth exceeds MAX_DEPTH.
 *
 * @param	string	$url	URL of the page to crawl
 * @param	int		$depth	Current recursion depth (0 for the start page)
 * @return	void
 */
function followLink($url, $depth = 0)
{
	global $crawledLinks;

	if ($depth > MAX_DEPTH) {
		echo "<div style='color:red;'>The Crawler is giving up!</div>";
		return;
	}

	// Remember the page being crawled so that a link pointing back to it is not crawled again.
	if (!in_array($url, $crawledLinks, true)) {
		$crawledLinks[] = $url;
	}

	$options = array(
		'http' => array(
			'method' => "GET",
			// The HTTP context option is spelled "user_agent"; any other key is silently ignored.
			'user_agent' => "gfgBot/0.1"
		)
	);
	$context = stream_context_create($options);

	// file_get_contents() returns false on failure, and DOMDocument::loadHTML() rejects an
	// empty string (ValueError in PHP 8), so unreachable or empty pages are skipped.
	$html = @file_get_contents($url, false, $context);
	if ($html === false || trim($html) === '') {
		return;
	}

	// Real-world HTML is rarely valid: collect libxml errors instead of emitting warnings.
	$previousSetting = libxml_use_internal_errors(true);
	$doc = new DOMDocument();
	$doc->loadHTML($html);
	libxml_clear_errors();
	libxml_use_internal_errors($previousSetting);

	$pageTitle = getDocTitle($doc, $url);
	$metaData = getDocMetaData($doc);

	// Links first seen on this page; they are crawled only after the whole page was scanned.
	$crawling = array();
	foreach ($doc->getElementsByTagName('a') as $anchor) {
		$link = $anchor->getAttribute('href');
		if (ignoreLink($link)) {
			continue;
		}
		$link = convertLink($url, $link);
		if (!in_array($link, $crawledLinks, true)) {
			$crawledLinks[] = $link;
			$crawling[] = $link;
			insertIntoDatabase($link, $pageTitle, $metaData, $depth);
		}
	}

	foreach ($crawling as $crawlURL) {
		followLink($crawlURL, $depth + 1);
	}
}
/**
 * Turn the href found on a page into an absolute URL.
 *
 * Handled forms:
 * - "//host/path"       protocol-relative: the scheme of $site is prepended ("scheme:" + path)
 * - "http://", "https://" already absolute: returned unchanged
 * - "www.host/path"     host without scheme: the scheme of $site is prepended
 * - "/path"             root-relative: resolved against scheme://host[:port] of $site
 * - anything else       appended to $site with exactly one "/" in between
 *
 * @param	string	$site	URL of the page the link was found on
 * @param	string	$path	Value of the href attribute
 * @return	string			Absolute URL
 */
function convertLink($site, $path)
{
	$parts = parse_url($site);
	if (!is_array($parts)) {
		$parts = array();
	}
	// Fall back to http when $site carries no scheme, so the result is still a usable URL.
	$scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

	if (strncmp($path, '//', 2) === 0) {
		// The ":" separator is required, otherwise the result would be e.g. "http//host".
		return $scheme.':'.$path;
	}

	if (strncasecmp($path, 'http://', 7) === 0 || strncasecmp($path, 'https://', 8) === 0) {
		return $path;
	}

	if (strncasecmp($path, 'www.', 4) === 0) {
		return $scheme.'://'.$path;
	}

	if (strncmp($path, '/', 1) === 0 && isset($parts['host'])) {
		$origin = $scheme.'://'.$parts['host'];
		if (isset($parts['port'])) {
			$origin .= ':'.$parts['port'];
		}
		return $origin.$path;
	}

	// Avoid a doubled slash when $site ends with "/" or $path starts with one.
	return rtrim($site, '/').'/'.ltrim($path, '/');
}
/**
 * Tell whether a href value must be skipped by the crawler.
 *
 * A link is ignored when it is empty (an <a> without href), when it is a
 * fragment ("#..."), or when it uses a scheme that cannot be fetched as a
 * page (javascript:, mailto:, tel:).
 *
 * @param	string	$url	Value of the href attribute
 * @return	boolean			True if the link must not be followed
 */
function ignoreLink($url)
{
	$url = trim((string) $url);

	// Test for emptiness first: reading $url[0] on an empty string raises a warning.
	if ($url === '' || $url[0] === '#') {
		return true;
	}

	foreach (array('javascript:', 'mailto:', 'tel:') as $prefix) {
		if (strncasecmp($url, $prefix, strlen($prefix)) === 0) {
			return true;
		}
	}

	return false;
}
/**
 * Report a newly discovered link.
 *
 * Despite its name, nothing is stored: the record is only printed as an HTML
 * fragment. All values come from crawled pages, so they are HTML-escaped
 * before being printed.
 *
 * @param	string	$link		Absolute URL of the discovered link
 * @param	string	$title		Title of the page the link was found on
 * @param	array	$metaData	Meta data with the keys 'description' and 'keywords'
 * @param	int		$depth		Depth at which the link was found (currently not printed)
 * @return	void
 */
function insertIntoDatabase($link, $title, &$metaData, $depth)
{
	// Missing keys are printed as empty strings instead of raising a notice.
	$description = isset($metaData['description']) ? $metaData['description'] : '';
	$keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

	$flags = ENT_QUOTES | ENT_SUBSTITUTE;

	echo "Inserting new record {URL= ".htmlspecialchars((string) $link, $flags, 'UTF-8')
		.", Title = '".htmlspecialchars((string) $title, $flags, 'UTF-8')
		."', Description = '".htmlspecialchars((string) $description, $flags, 'UTF-8')
		."', Keywords = ' ".htmlspecialchars((string) $keywords, $flags, 'UTF-8')
		."'}<br/><br/><br/>";
}
/**
 * Return the title of a parsed HTML document, or the URL when there is none.
 *
 * Line breaks and runs of whitespace inside the <title> are collapsed into a
 * single space so that the title fits on one line.
 *
 * @param	DOMDocument	$doc	Parsed document
 * @param	string		$url	URL of the document, used as fallback
 * @return	string				Title of the document, or $url if the title is missing or empty
 */
function getDocTitle(&$doc, $url)
{
	$titleNodes = $doc->getElementsByTagName('title');
	if ($titleNodes->length == 0) {
		return $url;
	}

	// str_replace() with an empty search string does nothing, so newlines are
	// removed here with an explicit whitespace pattern instead.
	$title = trim((string) preg_replace('/[\r\n\t ]+/', ' ', (string) $titleNodes->item(0)->nodeValue));

	return ($title === '') ? $url : $title;
}
/**
 * Extract the "keywords" and "description" meta data of a parsed HTML document.
 *
 * Meta names are matched case-insensitively and <meta> tags without a name
 * attribute (charset, http-equiv, ...) are skipped. Line breaks and runs of
 * whitespace in the values are collapsed into a single space.
 *
 * @param	DOMDocument	$doc	Parsed document
 * @return	array				Array with the keys 'keywords' (default '') and
 *								'description' (default 'No Description Available')
 */
function getDocMetaData(&$doc)
{
	$metaData = array();
	foreach ($doc->getElementsByTagName('meta') as $node) {
		$name = strtolower(trim($node->getAttribute('name')));
		if ($name === '') {
			continue;
		}
		$metaData[$name] = $node->getAttribute('content');
	}

	$keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';
	$description = isset($metaData['description']) ? $metaData['description'] : 'No Description Available';

	// str_replace() with an empty search string does nothing, so newlines are
	// removed here with an explicit whitespace pattern instead.
	return array(
		'keywords' => trim((string) preg_replace('/[\r\n\t ]+/', ' ', $keywords)),
		'description' => trim((string) preg_replace('/[\r\n\t ]+/', ' ', $description))
	);
}
// Entry point: crawl the URL given as first command-line argument, or the
// default local development instance when no argument is supplied.
$startUrl = (isset($argv[1]) && $argv[1] !== '') ? $argv[1] : "http://localhost/dolibarr_dev/htdocs";
followLink($startUrl);