Creating a page from an external url can grab images too.

This commit is contained in:
Laurent Destailleur 2017-10-01 18:27:54 +02:00
parent b3939f12b9
commit f785203c0d
2 changed files with 70 additions and 9 deletions

View File

@ -55,7 +55,7 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
curl_setopt($ch, CURLOPT_USERAGENT, 'Dolibarr geturl function');
@curl_setopt($ch, CURLOPT_FOLLOWLOCATION, ($followlocation?true:false)); // We use @ here because this may return warning if safe mode is on or open_basedir is on
if (count($addheaders)) curl_setopt($ch, CURLOPT_HTTPHEADER, $addheaders);
curl_setopt($ch, CURLINFO_HEADER_OUT, true); // To be able to retrieve request header and log it
@ -63,7 +63,7 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
// You can force, if supported a version like TLSv1 or TLSv1.2
if (! empty($conf->global->MAIN_CURL_SSLVERSION)) curl_setopt($ch, CURLOPT_SSLVERSION, $conf->global->MAIN_CURL_SSLVERSION);
//curl_setopt($ch, CURLOPT_SSLVERSION, 6); for tls 1.2
//turning off the server and peer verification(TrustManager Concept).
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
@ -82,12 +82,12 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
{
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
if (! is_array($param)) parse_str($param, $array_param);
else
else
{
dol_syslog("parameter param must be a string", LOG_WARNING);
$array_param=$param;
}
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($array_param)); // Setting param x=a&y=z as PUT fields
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($array_param)); // Setting param x=a&y=z as PUT fields
}
else if ($postorget == 'PUTALREADYFORMATED')
{
@ -121,7 +121,7 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
$response = curl_exec($ch);
$request = curl_getinfo($ch, CURLINFO_HEADER_OUT); // Reading of request must be done after sending request
dol_syslog("getURLContent request=".$request);
dol_syslog("getURLContent response=".$response);
@ -130,7 +130,7 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
{
// Ad keys to $rep
$rep['content']=$response;
// moving to display page to display curl errors
$rep['curl_error_no']=curl_errno($ch);
$rep['curl_error_msg']=curl_error($ch);
@ -146,12 +146,12 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
//$rep['header_size']=$info['header_size'];
//$rep['http_code']=$info['http_code'];
dol_syslog("getURLContent http_code=".$rep['http_code']);
// Add more keys to $rep
$rep['content']=$response;
$rep['curl_error_no']='';
$rep['curl_error_msg']='';
//closing the curl
curl_close($ch);
}
@ -159,3 +159,19 @@ function getURLContent($url,$postorget='GET',$param='',$followlocation=1,$addhea
return $rep;
}
/**
* Function get second level domain name.
* For example: https://www.abc.mydomain.com/dir/page.html return 'mydomain'
*
* @param string $url Full URL.
* @return string Returns domaine name
*/
function getDomainFromURL($url)
{
$tmpdomain = preg_replace('/^https?:\/\//i', '', $url); // Remove http(s)://
$tmpdomain = preg_replace('/\/.*$/i', '', $tmpdomain); // Remove part after domain
$tmpdomain = preg_replace('/\.[^\.]+$/', '', $tmpdomain); // Remove first level domain (.com, .net, ...)
$tmpdomain = preg_replace('/^[^\.]+\./', '', $tmpdomain); // Remove part www. before domain name
return $tmpdomain;
}

View File

@ -94,6 +94,7 @@ $pageid=GETPOST('pageid', 'int');
$pageref=GETPOST('pageref', 'aZ09');
$action=GETPOST('action','alpha');
if (GETPOST('delete')) { $action='delete'; }
if (GETPOST('preview')) $action='preview';
if (GETPOST('createsite')) { $action='createsite'; }
@ -246,7 +247,11 @@ if ($action == 'add')
$urltograbwithoutdomain = preg_replace('/^https?:\/\/[^\/]+\/?/i', '', $urltograbwithoutdomain);
$objectpage->pageurl = basename($urltograbwithoutdomain);
if (empty($objectpage->pageurl)) $objectpage->pageurl='home';
if (empty($objectpage->pageurl))
{
$tmpdomain = getDomainFromURL($urltograb);
$objectpage->pageurl='home'.$tmpdomain;
}
if (preg_match('/<title>(.*)<\/title>/ims', $head, $regtmp))
{
@ -270,6 +275,46 @@ if ($action == 'add')
$objectpage->content = preg_replace('/^.*<body[^>]*>/ims', '', $objectpage->content);
$objectpage->content = preg_replace('/<\/body[^>]*>.*$/ims', '', $objectpage->content);
$tmp = $objectpage->content;
// Now loop o to fetch all images
preg_match_all('/<img([^\.\/]+)src="([^>"]+)"([^>]*)>/i', $objectpage->content, $regs);
foreach ($regs[0] as $key => $val)
{
$urltograbbis = $urltograb.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
$linkwithoutdomain = $regs[2][$key];
$filetosave = $conf->medias->multidir_output[$conf->entity].'/image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $regs[2][$key])?'':'/').$regs[2][$key];
if (preg_match('/^http/', $regs[2][$key]))
{
$urltograbbis = $regs[2][$key];
$linkwithoutdomain = preg_replace('/^https?:\/\/[^\/]+\//i', '', $regs[2][$key]);
$filetosave = $conf->medias->multidir_output[$conf->entity].'/image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
}
$tmpgeturl = getURLContent($urltograbbis);
if ($tmpgeturl['curl_error_no'])
{
$error++;
setEventMessages($tmpgeturl['curl_error_msg'], null, 'errors');
$action='create';
}
else
{
dol_mkdir(dirname($filetosave));
$fp = fopen($filetosave, "w");
fputs($fp, $tmpgeturl['content']);
fclose($fp);
if (! empty($conf->global->MAIN_UMASK))
@chmod($file, octdec($conf->global->MAIN_UMASK));
}
$filename = 'image/'.$object->ref.'/'.$objectpage->pageurl.(preg_match('/^\//', $linkwithoutdomain)?'':'/').$linkwithoutdomain;
$tmp = preg_replace('/'.preg_quote($regs[0][$key],'/').'/i', '<img'.$regs[1][$key].'src="'.DOL_URL_ROOT.'/viewimage.php?modulepart=medias&file='.$filename.'"'.$regs[3][$key].'>', $tmp);
}
//print dol_escape_htmltag($tmp);exit;
$objectpage->content = $tmp;
$objectpage->grabbed_from = $urltograb;
}
}