2013-06-03 17:44:55 +02:00
< ? php
2020-10-27 15:06:16 +01:00
/* Copyright ( C ) 2008 - 2020 Laurent Destailleur < eldy @ users . sourceforge . net >
2013-06-03 17:44:55 +02:00
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 3 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
2019-09-23 21:55:30 +02:00
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 * or see https://www.gnu.org/
2013-06-03 17:44:55 +02:00
*/
/**
2013-09-18 23:39:53 +02:00
 *	\file		htdocs/core/lib/geturl.lib.php
2020-10-27 18:02:05 +01:00
 *	\brief		This file contains functions dedicated to get URLs.
2013-06-03 17:44:55 +02:00
*/
/**
 * Get the content of an URL with cURL (through a proxy if one is configured).
 * Timeouts, proxy and TLS version are read from the Dolibarr setup ($conf->global).
 * Redirections are not delegated to cURL: they are followed manually (at most 5 hops)
 * so that the anti-SSRF checks are applied to every URL of the chain:
 * - MAIN_SECURITY_ANTI_SSRF_SERVER_IP can hold a comma separated list of IPs of the server
 * - private/reserved ranges (127.*.*.*, 10.*.*.*, ...) are detected automatically
 *
 * @param	string		$url				URL to call.
 * @param	string		$postorget			'POST', 'GET', 'HEAD', 'PUT', 'PUTALREADYFORMATED', 'POSTALREADYFORMATED', 'DELETE'
 * @param	string		$param				Parameters of URL (x=value1&y=value2) or an already formatted content when $postorget is 'PUTALREADYFORMATED' or 'POSTALREADYFORMATED'
 * @param	int			$followlocation		0=Do not follow, 1=Follow location (HTTP codes 301, 302, 303 and 307).
 * @param	string[]	$addheaders			Array of strings to add into the header. Example: ('Accept: application/xrds+xml', ....)
 * @param	string[]	$allowedschemes		List of schemes that are allowed ('http' + 'https' only by default)
 * @param	int			$localurl			0=Only external URLs are possible, 1=Only local URLs, 2=Both external and local URLs are allowed.
 * @return	array							Associative array:
 * 											- on cURL error: 'content', 'curl_error_no', 'curl_error_msg'
 * 											- otherwise: all the keys of curl_getinfo() (like 'http_code'), 'content' (only if the response is not empty),
 * 											  and 'curl_error_no' / 'curl_error_msg' set to ''
 * 											- when the URL is refused by the anti-SSRF checks: 'http_code' = 400 and 'content' = error message
 */
function getURLContent($url, $postorget = 'GET', $param = '', $followlocation = 1, $addheaders = array(), $allowedschemes = array('http', 'https'), $localurl = 0)
{
	global $conf;

	// Proxy setup coming from the Dolibarr configuration (0 when the option is not set)
	$USE_PROXY = empty($conf->global->MAIN_PROXY_USE) ? 0 : $conf->global->MAIN_PROXY_USE;
	$PROXY_HOST = empty($conf->global->MAIN_PROXY_HOST) ? 0 : $conf->global->MAIN_PROXY_HOST;
	$PROXY_PORT = empty($conf->global->MAIN_PROXY_PORT) ? 0 : $conf->global->MAIN_PROXY_PORT;
	$PROXY_USER = empty($conf->global->MAIN_PROXY_USER) ? 0 : $conf->global->MAIN_PROXY_USER;
	$PROXY_PASS = empty($conf->global->MAIN_PROXY_PASS) ? 0 : $conf->global->MAIN_PROXY_PASS;

	dol_syslog(" getURLContent postorget= " . $postorget . " URL= " . $url . " param= " . $param);

	// Setting the curl parameters
	$ch = curl_init();

	curl_setopt($ch, CURLOPT_VERBOSE, 1);
	curl_setopt($ch, CURLOPT_USERAGENT, 'Dolibarr geturl function');

	// We use @ here because this may return a warning if safe mode is on or open_basedir is on (following location is forbidden when safe mode is on).
	// We force the value to false so we will manage redirections ourself later.
	@curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

	if (is_array($addheaders) && count($addheaders)) {
		curl_setopt($ch, CURLOPT_HTTPHEADER, $addheaders);
	}
	curl_setopt($ch, CURLINFO_HEADER_OUT, true); // To be able to retrieve the request header and log it

	// By default use the TLS version decided by PHP.
	// You can force, if supported, a version like TLSv1 or TLSv1.2 (for example 6 for TLS 1.2)
	if (!empty($conf->global->MAIN_CURL_SSLVERSION)) {
		curl_setopt($ch, CURLOPT_SSLVERSION, $conf->global->MAIN_CURL_SSLVERSION);
	}

	// Turning off the server and peer verification (TrustManager Concept).
	curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
	curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

	// Restrict use to some protocols only
	$protocols = 0;
	if (is_array($allowedschemes)) {
		foreach ($allowedschemes as $allowedscheme) {
			if ($allowedscheme == 'http') {
				$protocols |= CURLPROTO_HTTP;
			}
			if ($allowedscheme == 'https') {
				$protocols |= CURLPROTO_HTTPS;
			}
		}
		curl_setopt($ch, CURLOPT_PROTOCOLS, $protocols);
		curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, $protocols);
	}

	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, empty($conf->global->MAIN_USE_CONNECT_TIMEOUT) ? 5 : $conf->global->MAIN_USE_CONNECT_TIMEOUT);
	curl_setopt($ch, CURLOPT_TIMEOUT, empty($conf->global->MAIN_USE_RESPONSE_TIMEOUT) ? 30 : $conf->global->MAIN_USE_RESPONSE_TIMEOUT);

	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // We want the response

	if ($postorget == 'POST') {
		curl_setopt($ch, CURLOPT_POST, 1); // POST
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // Setting param x=a&y=z as POST fields
	} elseif ($postorget == 'POSTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST'); // HTTP request is 'POST' but the param string is taken as it is
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of post, like a xml string
	} elseif ($postorget == 'PUT') {
		$array_param = null;
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
		if (!is_array($param)) {
			parse_str($param, $array_param);
		} else {
			// An array is tolerated but the documented type is a string
			dol_syslog(" parameter param must be a string ", LOG_WARNING);
			$array_param = $param;
		}
		curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($array_param)); // Setting param x=a&y=z as PUT fields
	} elseif ($postorget == 'PUTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of put, like a xml string
	} elseif ($postorget == 'HEAD') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'HEAD'); // HTTP request is 'HEAD'
		curl_setopt($ch, CURLOPT_NOBODY, true);
	} elseif ($postorget == 'DELETE') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'DELETE'); // HTTP request is 'DELETE'
	} else {
		curl_setopt($ch, CURLOPT_POST, 0); // GET
	}

	// Use the proxy if enabled in setup
	if ($USE_PROXY) {
		// The proxy password is masked: credentials must never be written into log files
		dol_syslog(" getURLContent set proxy to " . $PROXY_HOST . " : " . $PROXY_PORT . " - " . $PROXY_USER . " : " . (empty($PROXY_PASS) ? '' : '*****'));
		// cURL expects "host:port" and "user:password" without any space around the colon
		curl_setopt($ch, CURLOPT_PROXY, $PROXY_HOST . ':' . $PROXY_PORT);
		if ($PROXY_USER) {
			curl_setopt($ch, CURLOPT_PROXYUSERPWD, $PROXY_USER . ':' . $PROXY_PASS);
		}
	}

	$newUrl = $url;
	$maxRedirection = 5;
	$info = array();
	$response = '';

	do {
		if ($maxRedirection < 1) {
			break;
		}

		curl_setopt($ch, CURLOPT_URL, $newUrl);

		// Parse $newUrl. parse_url() may return false or an array without 'host'/'port', so both keys are read defensively.
		$newUrlArray = parse_url($newUrl);
		$hosttocheck = isset($newUrlArray['host']) ? $newUrlArray['host'] : '';
		$hosttocheck = str_replace(array('[', ']'), '', $hosttocheck); // Remove brackets of IPv6

		// Deny some reserved host names
		if (in_array($hosttocheck, array('metadata.google.internal'))) {
			$info['http_code'] = 400;
			$info['content'] = 'Error bad hostname ' . $hosttocheck . ' (Used by Google metadata). This value for hostname is not allowed.';
			break;
		}

		// Clean host name $hosttocheck to convert it into an IP $iptocheck
		if (in_array($hosttocheck, array('localhost', 'localhost.domain'))) {
			$iptocheck = '127.0.0.1';
		} elseif (in_array($hosttocheck, array('ip6-localhost', 'ip6-loopback'))) {
			$iptocheck = '::1';
		} else {
			// Resolve $hosttocheck to get the IP $iptocheck
			if (function_exists('gethostbyname')) {
				$iptocheck = gethostbyname($hosttocheck);
			} else {
				$iptocheck = $hosttocheck;
			}
			// TODO Resolve ip v6
		}

		// Check $iptocheck is an IP (v4 or v6), if not clear the value so the IP based checks are skipped.
		if (!filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4 | FILTER_FLAG_IPV6)) {
			$iptocheck = '0';
		}

		if ($iptocheck) {
			if ($localurl == 0) {	// Only external url allowed (dangerous, may allow to get malware)
				if (!filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE)) {
					// Deny ips like 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8 and 240.0.0.0/4, ::1/128, ::/128, ::ffff:0:0/96, fe80::/10...
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (private or reserved range). Must be an external URL.';
					break;
				}
				if (!empty($_SERVER["SERVER_ADDR"]) && $iptocheck == $_SERVER["SERVER_ADDR"]) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is a local IP). Must be an external URL.';
					break;
				}
				if (!empty($conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP) && in_array($iptocheck, explode(',', $conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP))) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is a local IP defined into MAIN_SECURITY_SERVER_IP). Must be an external URL.';
					break;
				}
			}
			if ($localurl == 1) {	// Only local url allowed (dangerous, may allow to get metadata on server or make internal port scanning)
				// Deny ips NOT like 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8 and 240.0.0.0/4, ::1/128, ::/128, ::ffff:0:0/96, fe80::/10...
				if (filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE)) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname ' . $iptocheck . '. Must be a local URL.';
					break;
				}
				if (!empty($conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP) && !in_array($iptocheck, explode(',', $conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP))) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is not a local IP defined into list MAIN_SECURITY_SERVER_IP). Must be a local URL in allowed list.';
					break;
				}
			}

			// Common check (local and external)
			if (in_array($iptocheck, array('100.100.100.200'))) {
				$info['http_code'] = 400;
				$info['content'] = 'Error bad hostname IP (Used by Alibaba metadata). Must be an external URL.';
				break;
			}

			// Set CURLOPT_CONNECT_TO so curl will not try another resolution that may give a different result. Possible only on PHP v7+
			if (defined('CURLOPT_CONNECT_TO')) {
				// NOTE(review): when the URL has no explicit port, 0 is used for both ports (same value the %d conversion produced before).
				// Confirm against the libcurl CURLOPT_CONNECT_TO documentation that such an entry is applied as intended.
				$porttouse = empty($newUrlArray['port']) ? 0 : $newUrlArray['port'];
				$connect_to = array(sprintf("%s:%d:%s:%d", $newUrlArray['host'], $porttouse, $iptocheck, $porttouse));
				curl_setopt($ch, CURLOPT_CONNECT_TO, $connect_to);
			}
		}

		// Getting response from server
		$response = curl_exec($ch);
		$info = curl_getinfo($ch); // Reading of request must be done after sending request
		$http_code = $info['http_code'];

		if ($followlocation && ($http_code == 301 || $http_code == 302 || $http_code == 303 || $http_code == 307)) {
			// Redirection: loop again on the new URL ($http_code stays non zero so the while condition is true)
			$newUrl = $info['redirect_url'];
			$maxRedirection--;
			// TODO Use $info['local_ip'] and $info['primary_ip'] ?
			continue;
		} else {
			$http_code = 0; // Final answer received, this ends the loop
		}
	} while ($http_code);

	$request = curl_getinfo($ch, CURLINFO_HEADER_OUT); // Reading of request must be done after sending request

	dol_syslog(" getURLContent request= " . $request);
	// The response may contain binary data, so only its size is logged
	dol_syslog(" getURLContent response size= " . strlen($response));

	$rep = array();
	if (curl_errno($ch)) {
		// cURL failure: return the raw response and the cURL error
		$rep['content'] = $response;
		$rep['curl_error_no'] = curl_errno($ch);
		$rep['curl_error_msg'] = curl_error($ch);

		dol_syslog(" getURLContent response array is " . join(',', $rep));
	} else {
		// No cURL error: start from the curl_getinfo() array (or from the error set by the anti-SSRF checks)
		$rep = $info;

		dol_syslog(" getURLContent http_code= " . $rep['http_code']);

		// 'content' is overwritten only when a response was received, so an anti-SSRF error message is kept
		if ($response) {
			$rep['content'] = $response;
		}
		$rep['curl_error_no'] = '';
		$rep['curl_error_msg'] = '';
	}

	// Closing the curl handle
	curl_close($ch);

	return $rep;
}
2017-10-01 18:27:54 +02:00
/**
 * Function to get the second level domain name of an URL.
 * For example: https://www.abc.mydomain.com/dir/page.html returns 'mydomain'
 * The scheme (http/https), the path, the query string, the fragment and the port are ignored.
 *
 * @param	string	$url	Full URL.
 * @param	int		$mode	0=return 'mydomain', 1=return 'mydomain.com', 2=return 'abc.mydomain.com'
 * @return	string			Returns the domain name
 */
function getDomainFromURL($url, $mode = 0)
{
	$tmpdomain = preg_replace('/^https?:\/\//i', '', $url); // Remove http(s)://
	$tmpdomain = preg_replace('/[\/?#].*$/', '', $tmpdomain); // Remove part after domain (path, but also a query string or fragment without path)
	$tmpdomain = preg_replace('/:\d+$/', '', $tmpdomain); // Remove port, so it does not pollute the last label of the domain

	if ($mode == 2) {
		$tmpdomain = preg_replace('/^.*\.([^\.]+)\.([^\.]+)\.([^\.]+)$/', '\1.\2.\3', $tmpdomain); // Remove part 'www.' before 'abc.mydomain.com'
	} else {
		$tmpdomain = preg_replace('/^.*\.([^\.]+)\.([^\.]+)$/', '\1.\2', $tmpdomain); // Remove part 'www.abc.' before 'mydomain.com'
	}

	if (empty($mode)) {
		$tmpdomain = preg_replace('/\.[^\.]+$/', '', $tmpdomain); // Remove first level domain (.com, .net, ...)
	}

	return $tmpdomain;
}
2017-12-10 17:59:19 +01:00
/**
 * Function to get the root url (scheme + host, and port if any) from a long url.
 * For example: https://www.abc.mydomain.com/dir/page.html returns 'https://www.abc.mydomain.com'
 * For example: http://www.abc.mydomain.com/ returns 'http://www.abc.mydomain.com'
 * The scheme found in the URL is kept as it is. If the URL has no http(s) scheme, no prefix is added.
 *
 * @param	string	$url	Full URL.
 * @return	string			Returns the root url
 */
function getRootURLFromURL($url)
{
	$prefix = '';
	$reg = null;
	if (preg_match('/^(https?:\/\/)/i', $url, $reg)) {
		$prefix = $reg[1]; // Keep the scheme exactly as written in the URL
	}

	$tmpurl = preg_replace('/^https?:\/\//i', '', $url); // Remove http(s)://
	$tmpurl = preg_replace('/[\/?#].*$/', '', $tmpurl); // Remove part after domain (path, but also a query string or fragment without path)

	return $prefix.$tmpurl;
}
/**
 * Function to remove comments into HTML content.
 * Every "<!-- ... -->" sequence is removed, including multi-line comments,
 * empty comments and comments that contain hyphens.
 *
 * @param	string	$content	Text content
 * @return	string				Returns text without HTML comments (the content unchanged if the regex engine fails)
 */
function removeHtmlComment($content)
{
	// Lazy match with the "s" modifier so a comment may span several lines and stops at the first "-->"
	$cleaned = preg_replace('/<!--.*?-->/s', '', $content);

	// preg_replace() returns null on PCRE error (for example backtrack limit reached): keep the original content in that case
	return ($cleaned === null) ? $content : $cleaned;
}