Squiz Matrix  4.12.2
 All Data Structures Namespaces Functions Variables Pages
page_remote_content.inc
1 <?php
18 require_once SQ_CORE_PACKAGE_PATH.'/page/page.inc';
19 require_once SQ_FUDGE_PATH.'/general/www.inc';
20 require_once SQ_FUDGE_PATH.'/general/text.inc';
21 require_once SQ_INCLUDE_PATH.'/general_occasional.inc';
22 
// These values are fixed for every remote content page; they are candidates
// for becoming attributes of this asset instead.

// upper bound on redirects followed while fetching the remote URL
define('SQ_REMOTE_MAX_REDIRECTS', 10);

// request variable that carries the encoded remote target
define('SQ_REMOTE_SOURCE', 'sq_content_src');

// suffix of the hidden field that lists the file inputs found in the remote page
define('SQ_REMOTE_INPUT_FILE_NAMES', 'sq_remote_input_file_names');

// name of the cookie set whenever a remote request is processed
define('SQ_REMOTE_SESSION_COOKIE_NAME', 'sq-active-remote-session');
28 
29 
42 {
43 
/**
 * Names of the content transformations this page knows about.
 *
 * _process() switches every one of them on for an initial request
 * (a request that carries no encoded remote target).
 *
 * @var array
 */
var $_transformation_options = array(
    'use_design',
    'rewrite_urls',
    'trim_content',
    'do_tag_replacements',
    'replace_matrix_keywords',
    'strip_images',
);
52 
53 
/**
 * Constructor
 *
 * Sets the _ser_attrs flag on this object and then hands over to the
 * parent constructor, in that order.
 *
 * @param int $assetid the id of the asset to load (0 by default)
 */
function __construct($assetid=0)
{
    $this->_ser_attrs = TRUE;

    parent::__construct($assetid);

}//end constructor
66 
67 
/**
 * Print this asset for a frontend request.
 *
 * Checks read access, runs _process() and then either delegates to the
 * parent implementation (when the 'use_design' setting is on) or sends the
 * headers kept for the fetched URL and echoes the stored content.
 *
 * Fixes over the previous version:
 *  - the performance timer is also stopped when access is denied;
 *  - the fetched URL is read from $this->_tmp with a default, so it is
 *    always defined when the no-design branch uses it (it used to be
 *    assigned only when _process() returned TRUE);
 *  - transform settings are read with defaults instead of raw indexes.
 *
 * @return void
 */
function printFrontend()
{
    // start performance mode timer
    $GLOBALS['SQ_SYSTEM']->pm->startTimer($this, 'printFrontend');

    if (!$this->readAccess()) {
        $GLOBALS['SQ_SYSTEM']->paintLogin(translate('login'), translate('cannot_access_asset', $this->name));
        // the timer was started above; balance it on this early exit as well
        $GLOBALS['SQ_SYSTEM']->pm->stopTimer($this, 'printFrontend');
        return;
    }

    $success = $this->_process();

    // _process() sets $this->_tmp['url'] only after a successful fetch
    $url = array_get_index($this->_tmp, 'url', '');
    if ($success === TRUE) {
        $this->_tmp['success'][$url] = $success;
    }

    $settings = array_get_index($this->_tmp, 'transform_settings', Array());

    if (!array_get_index($settings, 'replace_matrix_keywords')) {
        $GLOBALS['SQ_SYSTEM']->setGlobalDefine('SQ_REPLACE_MYSOURCE_LEVEL_KEYWORDS', FALSE);
    }

    if (array_get_index($settings, 'use_design')) {
        parent::printFrontend();
    } else {
        // do not use design: replay the kept headers, then the stored body
        $headers_list = array_get_index($this->_tmp, 'headers', Array());
        $headers      = array_get_index($headers_list, $url, Array());
        foreach ($headers as $name => $value) {
            header("$name: $value");
        }
        $content = array_get_index($this->_tmp, 'content', Array());
        echo array_get_index($content, $url, '');
    }

    // stop performance mode timer
    $GLOBALS['SQ_SYSTEM']->pm->stopTimer($this, 'printFrontend');

}//end printFrontend()
118 
119 
/**
 * Print the body of this page.
 *
 * Runs _process() and echoes one of:
 *  - the 'error_message' attribute, when processing failed and
 *    'display_error' is set;
 *  - the 'binary_message' attribute (with the remote_url keyword replaced),
 *    when the fetched content type is not text;
 *  - the stored content for the fetched URL otherwise.
 *
 * Fix over the previous version: _process() stores headers per fetched URL
 * ($this->_tmp['headers'][$url]), but the content type used to be looked up
 * directly on $this->_tmp['headers'], so it was never found and the binary
 * message could not be shown. The lookup now goes through the URL first,
 * and the content lookup is guarded the same way.
 *
 * @return boolean always TRUE
 */
function printBody()
{
    // start performance mode timer
    $GLOBALS['SQ_SYSTEM']->pm->startTimer($this, 'printBody');

    // Encoded URLs look like they've got keywords in them. Disable blanking out of unknown keywords
    $GLOBALS['SQ_SYSTEM']->setGlobalDefine('SQ_REPLACE_MYSOURCE_LEVEL_KEYWORDS', FALSE);

    $success = $this->_process();
    $url     = array_get_index($this->_tmp, 'url', '');

    if ($success === FALSE) {
        // processing failed
        if ($this->attr('display_error')) {
            echo $this->attr('error_message');
        }
    } else if (isset($this->_tmp['headers']) === TRUE && isset($this->_tmp['content']) === TRUE) {
        // when _process() returns NULL, headers and content may be missing,
        // hence the isset() checks above
        $headers = array_get_index($this->_tmp['headers'], $url, Array());
        $type    = array_get_index($headers, 'content-type');

        if (!empty($type) && (0 !== strpos($type, 'text/'))) {
            // if we have got to this point and are trying to tunnel
            // binary data, we must be nested and so we'll have to show
            // a message instead of the content
            $replacements = Array('remote_url' => $url);
            echo replace_keywords($this->attr('binary_message'), $replacements);
        } else {
            // everything is good. get the page body.
            echo array_get_index($this->_tmp['content'], $url, '');
        }
    }

    // stop performance mode timer
    $GLOBALS['SQ_SYSTEM']->pm->stopTimer($this, 'printBody');
    return TRUE;

}//end printBody()
171 
172 
/**
 * Fetch the remote URL for the current request and stage the result in $this->_tmp.
 *
 * Steps, in order:
 *  1. work out the target URL (the 'remote_url' attribute on an initial
 *     request, otherwise the URL decoded from the SQ_REMOTE_SOURCE variable)
 *     and the transformation settings that go with it;
 *  2. merge the current GET variables into the target's query string;
 *  3. build the POST payload (raw body, or $_POST plus uploaded files);
 *  4. fetch the URL with a per-session cookie jar;
 *  5. store the kept headers and the (possibly transformed) content under
 *     $this->_tmp['headers'][$fetched_url] / $this->_tmp['content'][$fetched_url].
 *
 * Fixes over the previous version:
 *  - the list of file input names posted back by the browser is only
 *    unserialized when it has exactly the shape _addInputFileNames()
 *    produces (a flat list of strings);
 *  - stray '&' separators left after removing the SQ_REMOTE_SOURCE pair,
 *    or after an empty request query string, are no longer sent on;
 *  - the target URL is split on the first '?' only;
 *  - a decoded hash without a 'url' entry no longer causes an undefined index;
 *  - the reference to the never-defined $files variable is gone;
 *  - the fetch result is released before the content is transformed.
 *
 * @return boolean|NULL TRUE when the URL was fetched and staged by this call,
 *                      NULL when the same URL was already processed
 *                      successfully, FALSE on any failure
 */
function _process()
{
    $remote_url    = trim($this->attr('remote_url'));
    $remote_source = array_get_index($_REQUEST, SQ_REMOTE_SOURCE);

    $initial_request = FALSE;

    $is_post = (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] == 'POST');
    $url = NULL;

    if (is_null($remote_source)) {
        $initial_request = TRUE;
        $url = $remote_url;
        // enable all transformation by default
        foreach ($this->_transformation_options as $v) {
            $this->_tmp['transform_settings'][$v] = TRUE;
        }
    } else {
        $this->_tmp['transform_settings'] = $this->_decodeHash($remote_source);
        // guard against a decoded hash that carries no 'url' entry
        $url = isset($this->_tmp['transform_settings']['url']) ? $this->_tmp['transform_settings']['url'] : NULL;
        unset($this->_tmp['transform_settings']['url']);
    }

    if (!$url) {
        trigger_localised_error('CMS0062', E_USER_WARNING);
        return FALSE;
    }

    if (!$this->isUrlAllowed($url)) {
        trigger_localised_error('CMS0057', E_USER_WARNING, $url);
        return FALSE;
    }

    // Implode the GET array to construct the query string.
    // Avoid using $_SERVER['query_string'], because some $_GET vars are passed indirectly. We have to trust $_GET[]
    $query_parts = Array();
    foreach ($_GET as $key => $value) {
        if (is_array($value)) {
            // array format, e.g test[0]=1: PHP turned it into a nested array, flatten it back to string form
            $query_parts[] = http_build_query(array($key => $value));
        } else {
            $query_parts[] = $key.'='.urlencode($value);
        }
    }
    $query_string = implode('&', $query_parts);

    if (!empty($query_string) && !is_null($remote_source)) {
        // we're preparing to pass all the query vars to the remote target
        // remove the query var that contains the reference to the remote target
        // because it is for our consumption only
        $source = $is_post ? $remote_source : urlencode($remote_source);
        $query_string = str_replace(SQ_REMOTE_SOURCE.'='.$source, '', $query_string);
        // removing the pair can leave a leading, trailing or doubled '&' behind
        $query_string = trim(preg_replace('/&{2,}/', '&', $query_string), "& \t\n\r\0\x0B");
    }

    // merge the get vars of current request with those of the source string
    // (split on the first '?' only, so a '?' inside the query survives)
    $url_parts = explode('?', $url, 2);
    if (isset($url_parts[1])) {
        // Replace global keywords on the HARD CODED portion of the query string only
        replace_global_keywords($url_parts[1]);
        if ($query_string === '') {
            $query_string = $url_parts[1];
        } else {
            $query_string = $url_parts[1].'&'.$query_string;
        }
    }
    $url = $url_parts[0];
    if (empty($query_string) === FALSE) {
        $url .= '?'.$query_string;
    }

    // If we have already handled this complete URL, mark it as such
    $processed = array_get_index($this->_tmp, 'processed', Array());
    if (isset($processed[$url])) {
        // "NULL" implies that the URL was processed successfully before
        return $processed[$url] ? NULL : FALSE;
    }

    // here we mark that we've attempted processing of this request
    // if the processing turns out to be successful later, we will set it to "TRUE" then
    $this->_tmp['processed'][$url] = FALSE;

    $post_data = NULL;
    $post_is_encoded = FALSE;

    if ($is_post) {
        unset($_POST[SQ_REMOTE_SOURCE]);

        $content_type = array_get_index($_SERVER, 'CONTENT_TYPE', '');
        if (strpos(strtolower($content_type), 'multipart/form-data') === FALSE) {
            $post_data = trim(file_get_contents('php://input'));

            // NOTE that when we trim out SQ_REMOTE_SOURCE variable and value, we also need to
            // remove '&' delimiter before or after if it exists.
            $remote_source_str = SQ_REMOTE_SOURCE.'='.urlencode($remote_source);
            $remote_source_str_len = strlen($remote_source_str);
            if (($pos = strpos($post_data, $remote_source_str)) !== FALSE) {
                if ($pos == 0) {
                    // found at the beginning, trim one after
                    $post_data = substr($post_data, $remote_source_str_len + 1);
                } else if (($pos + $remote_source_str_len) == strlen($post_data)) {
                    // found at the end, trim one before
                    $post_data = substr($post_data, 0, strlen($post_data) - $remote_source_str_len - 1);
                } else {
                    // found in the middle, trim one after
                    $post_data = substr($post_data, 0, $pos).substr($post_data, $pos + $remote_source_str_len + 1);
                }
            }
            $post_is_encoded = TRUE;
        } else {
            // NOTE that this is not fully compatible with those systems
            // where post data is allowed to look like a=1&a=2&a=3
            // PHP will only see a=3 in POST vars, so we lose a=1,a=2, so we cannot pass it along
            // that's why when content-type is NOT form-data we use the php://input
            // the reason we don't always use php://input is because when content type is multipart/form-data it is empty
            $post_data = $_POST;

            // array format, e.g test[0]=1: PHP turned it into a nested array, flatten it to string keys for CURL
            foreach ($post_data as $name => $value) {
                if (is_array($value)) {
                    $string_param = http_build_query(array($name => $value));
                    $array_param = explode('&', $string_param);
                    foreach ($array_param as $part) {
                        $part_param = explode('=', $part);
                        if (isset($part_param[0]) && isset($part_param[1])) {
                            $post_data[urldecode($part_param[0])] = urldecode($part_param[1]);
                        }
                    }
                    unset($post_data[$name]);
                }
            }
        }

    }//end if

    // support custom post variables if they've been set
    // NOTE: custom post variables are only sent on INITIAL request
    $post_variables = $this->attr('post_variables');
    if ($initial_request && !empty($post_variables)) {
        $is_post = TRUE;
        if ($post_is_encoded) {
            $addon_array = Array();
            if (!empty($post_data)) $addon_array[] = $post_data;
            foreach ($post_variables as $index => $variable) {
                $addon_array[] = urlencode($variable['name']).'='.urlencode(replace_global_keywords($variable['value']));
            }
            $post_data = implode('&', $addon_array);

        } else {
            foreach ($post_variables as $index => $variable) {
                $var_name = $variable['name'];
                $var_value = $variable['value'];
                $var_value = replace_global_keywords($var_value);
                if (isset($post_data[$var_name])) {
                    if (!is_array($post_data[$var_name])) {
                        $post_data[$var_name] = Array($post_data[$var_name], $var_value);
                    } else {
                        $post_data[$var_name][] = $var_value;
                    }
                } else {
                    $post_data[$var_name] = $var_value;
                }
            }
        }
    }

    // restore cookie from user's session storage
    // since we really want to make sure Matrix user got individual cookie jar for the remote content
    if (!is_dir($this->data_path)) {
        create_directory($this->data_path);
    }
    $sessionid = session_id();
    $cookie_filename = $this->data_path."/cookies_".$sessionid;
    if (isset($_SESSION['SQ_REMOTE_CONTENT_COOKIE'][$this->id])) {
        string_to_file($_SESSION['SQ_REMOTE_CONTENT_COOKIE'][$this->id], $cookie_filename);
    }

    $options = array(
        'CONNECTTIMEOUT' => $this->attr('timeout'),
        'COOKIEFILE'     => $cookie_filename,
        'COOKIEJAR'      => $cookie_filename,
        'TIMEOUT'        => $this->attr('timeout'),
        'USERAGENT'      => SQ_SYSTEM_LONG_NAME,
        'RETURNTRANSFER' => 1,
    );

    // set HTTP authentication
    $user = $this->attr('username');
    if (!empty($user)) {
        $options['http_authentication']['username'] = $user;
        $options['http_authentication']['password'] = $this->attr('password');
    }

    if ($this->attr('allow_redirect')) {
        $options['FOLLOWLOCATION'] = true;
        $options['MAXREDIRS'] = SQ_REMOTE_MAX_REDIRECTS;
    }

    setcookie(SQ_REMOTE_SESSION_COOKIE_NAME, 1);

    if ($is_post) {
        $hidden_field_file_name = $this->getPrefix().'_'.SQ_REMOTE_INPUT_FILE_NAMES;
        $input_file_names = Array();
        if (!empty($_POST[$hidden_field_file_name])) {
            // SECURITY NOTE(review): this value is supplied by the browser and
            // goes into unserialize(). It is only accepted when it is exactly a
            // serialized flat list of strings - the shape _addInputFileNames()
            // writes into the hidden field - so no object can be instantiated.
            // A non-serialize format would be safer still, but would have to
            // change on the writing side at the same time.
            $serialized_names = $_POST[$hidden_field_file_name];
            if (is_string($serialized_names) && preg_match('/^a:\d+:\{(?:i:\d+;s:\d+:"[^"]*";)*\}$/D', $serialized_names)) {
                $decoded_names = unserialize($serialized_names);
                if (is_array($decoded_names)) {
                    $input_file_names = $decoded_names;
                }
            }
        }

        // Only send files to the remote URL if those files are in the remote content page
        if (!empty($input_file_names)) {
            foreach ($_FILES as $name => $details) {
                if (in_array($name, $input_file_names)) {
                    if (is_array($details['tmp_name'])) {
                        foreach ($details['tmp_name'] as $i => $tn) {
                            if ($details['error'][$i] != UPLOAD_ERR_OK) continue;
                            $new_name = SQ_TEMP_PATH.'/'.basename($details['name'][$i]);
                            if (move_uploaded_file($tn, $new_name)) {
                                $post_data[$name.'['.$i.']'] = '@'.$new_name;
                            }
                        }
                    } else {
                        if ($details['error'] != UPLOAD_ERR_OK) continue;
                        $new_name = SQ_TEMP_PATH.'/'.basename($details['name']);
                        if (move_uploaded_file($details['tmp_name'], $new_name)) {
                            $post_data[$name] = '@'.$new_name;
                        }
                    }
                }//end if
            }//end for
        }//end if

        // a workaround for curl's inability to handle empty post data
        if (empty($post_data) && $post_is_encoded) {
            $random = md5(rand());
            $post_data = $random.'='.$random;
        }

        $options['POST'] = true;
        $options['POSTFIELDS'] = $post_data;
    }

    // Make sure we're not trying to include (or redirect) to ourselves.
    $myUrls = $this->getURLs();
    foreach ($myUrls as $urlInfo) {
        foreach (array('http', 'https') as $schema) {
            if ($urlInfo[$schema] == 0) {
                continue;
            }
            $fullUrl = $schema.'://'.$urlInfo['url'];
            if ($fullUrl == $url) {
                trigger_localised_error('CMS0111', E_USER_WARNING);
                return FALSE;
            }
        }
    }

    $options['ssl_verifypeer'] = $this->attr('verify_ssl');
    $result = fetch_url($url, $options);

    // save cookie content to session
    if (is_file($cookie_filename)) {
        $_SESSION['SQ_REMOTE_CONTENT_COOKIE'][$this->id] = file_get_contents($cookie_filename);
        unlink($cookie_filename);
    }

    if ($result['errornumber'] != 0) {
        trigger_localised_error('CMS0063', E_USER_WARNING, $url, $result['errorstring']);
        return FALSE;
    }

    if (preg_match('/^[45].*?/', $result['curlinfo']['http_code'])) {
        trigger_localised_error('CMS0064', E_USER_WARNING, $result['curlinfo']['http_code'], $url);
        return FALSE;
    }

    $fetched_url = $result['curlinfo']['url'];
    $this->_tmp['url'] = $fetched_url;

    // check if we were redirected to the bad destination
    if ($fetched_url != $url) {
        if (!$this->isUrlAllowed($fetched_url)) {
            trigger_localised_error('CMS0058', E_USER_WARNING, $fetched_url, $url);
            return FALSE;
        }
    }

    $keep_headers = array();
    foreach (array('content_type', 'content_disposition') as $_header) {
        if (isset($result['curlinfo'][$_header]) === TRUE) {
            $keep_headers[str_replace('_', '-', $_header)] = $result['curlinfo'][$_header];
        }
    }

    // prepare temporary variables for output
    $this->_tmp['headers'][$fetched_url] = $keep_headers;
    $this->_tmp['content'][$fetched_url] = $result['response'];

    // wipe the fetch result because we won't use it again and the response
    // could be too big to keep a second copy of while it is transformed
    unset($result);

    if ($this->_shouldPassStraightThrough($this->_tmp['headers'][$fetched_url])) {
        foreach ($this->_tmp['transform_settings'] as $i => $v) {
            $this->_tmp['transform_settings'][$i] = FALSE;
        }
    } else {
        $this->_transformContent($this->_tmp['content'][$fetched_url]);
    }

    // Mark the successful processing of this url
    $this->_tmp['processed'][$url] = TRUE;

    return TRUE;

}//end _process()
507 
508 
/**
 * Apply the enabled transformations to the fetched content, in place.
 *
 * Order: trim to the start/end tags, tag replacements, image stripping,
 * URL rewriting, keyword replacement. The file input names are always
 * recorded at the end, whatever the settings are.
 *
 * @param string &$content the remote content to transform
 *
 * @return void
 */
function _transformContent(&$content)
{
    // keep only what sits between the configured start and end tags
    if ($this->_tmp['transform_settings']['trim_content']) {
        $this->_trimContent($content);
    }

    // swap tags as configured
    if ($this->_tmp['transform_settings']['do_tag_replacements']) {
        $this->_doTagReplacements($content);
    }

    // images go only when both the attribute and the request setting say so
    if ($this->attr('strip_images') && $this->_tmp['transform_settings']['strip_images']) {
        $this->_stripAllImages($content);
    }

    // point URLs where they need to go
    if ($this->_tmp['transform_settings']['rewrite_urls']) {
        $this->_rewriteURLs($content);
    }

    // substitute every %keyword% occurrence found in the content
    if ($this->_tmp['transform_settings']['replace_matrix_keywords']) {
        preg_match_all('/%(\w+)%/i', $content, $found);
        $total = count($found[1]);
        for ($i = 0; $i < $total; $i++) {
            $replacement = $this->getKeywordReplacement($found[1][$i]);
            $content = str_replace($found[0][$i], $replacement, $content);
        }
    }

    // remember the names of any file inputs in the content
    $this->_addInputFileNames($content);

}//end _transformContent()
552 
553 
/**
 * Add the names of file inputs (<input type="file">) to the HTML content
 * in a hidden field named <prefix>_SQ_REMOTE_INPUT_FILE_NAMES.
 *
 * The names are serialized into the field value; _process() reads the field
 * back on POST to decide which uploaded files are passed to the remote URL.
 *
 * Fix over the previous version: the hidden field is inserted once, right
 * after the first file input. str_replace() used to insert it after every
 * textual occurrence of that input's markup.
 *
 * @param string &$content The HTML content this remote content page will display
 *
 * @return void
 */
function _addInputFileNames(&$content)
{
    $pattern = '/<input\s+[^>]*type\s*=\s*(?:"|\')?file(?:"|\')?[^>]*>/i';
    if (!preg_match_all($pattern, $content, $matches)) {
        // no file inputs at all
        return;
    }

    // this pattern cannot capture names with a space inside:
    // name="hello world" yields only "hello"
    $name_pattern = '/\s+name\s*=\s*(?:"|\')?([^>"\'\s]+)/i';
    $file_names = Array();
    foreach ($matches[0] as $match) {
        if (preg_match($name_pattern, $match, $name_matches)) {
            $file_names[] = $name_matches[1];
        }
    }

    if (empty($file_names)) {
        return;
    }

    // hidden_field() prints its markup, so capture it from the output buffer
    require_once SQ_LIB_PATH.'/html_form/html_form.inc';
    ob_start();
    hidden_field($this->getPrefix().'_'.SQ_REMOTE_INPUT_FILE_NAMES, serialize($file_names));
    $hidden_field = ob_get_clean();

    // put the hidden field right after the first file input, and only there
    $first_input = $matches[0][0];
    $position = strpos($content, $first_input);
    if ($position !== FALSE) {
        $content = substr_replace($content, $first_input.$hidden_field, $position, strlen($first_input));
    }

}//end _addInputFileNames()
583 
/**
 * Decide whether a response must be delivered untouched.
 *
 * TRUE when the kept headers describe an attachment, or a content type
 * that does not mention 'text/'. A response without a content type is
 * treated as transformable.
 *
 * Fix over the previous version: both checks are now case-insensitive
 * (e.g. "Attachment", "TEXT/HTML"), as these header tokens are not
 * case-sensitive.
 *
 * @param array $headers kept headers, keyed 'content-type' / 'content-disposition'
 *
 * @return boolean
 */
function _shouldPassStraightThrough($headers)
{
    // we shouldn't interfere with attachments
    $disposition = array_get_index($headers, 'content-disposition');
    if (!is_null($disposition) && stripos($disposition, 'attachment') !== FALSE) {
        return TRUE;
    }

    // we shouldn't interfere with non-text content
    // (this actually includes javascript in some cases, but that's OK)
    $content_type = array_get_index($headers, 'content-type');
    if (!empty($content_type) && (stripos($content_type, 'text/') === FALSE)) {
        return TRUE;
    }

    return FALSE;

}//end _shouldPassStraightThrough()
610 
611 
612 //-- CONTENT TRANSFORMATION FNS & THEIR HELPERS --//
613 
614 
/**
 * Reduce the response body to what lies between the configured tags.
 *
 * Nothing happens when both 'start_tag' and 'end_tag' are empty. When
 * 'include_tags' is set the tags themselves are kept in the result. When
 * nothing matches, the body is emptied only if 'no_tag_clear' is set.
 *
 * NOTE(review): '.*' is greedy and runs with the /s flag, so one match
 * spans from the first start tag to the last end tag - confirm that this
 * is what 'multi_occurence' is meant to work with.
 *
 * @param string &$response_body the content to trim, modified in place
 *
 * @return void
 */
function _trimContent(&$response_body)
{
    // escape the tags so they are matched literally
    $open     = preg_quote($this->attr('start_tag'), '/');
    $close    = preg_quote($this->attr('end_tag'), '/');
    $join_all = $this->attr('multi_occurence');

    // without any tag the page is good as it is, no scanning necessary
    if (empty($open) && empty($close)) {
        return;
    }

    $pattern = $this->attr('include_tags') ? "/($open.*$close)/s" : "/$open(.*)$close/s";

    preg_match_all($pattern, $response_body, $found);

    if (count($found[1])) {
        // either every section joined together, or just the first one
        $response_body = $join_all ? implode('', $found[1]) : $found[1][0];
    } else if ($this->attr('no_tag_clear')) {
        // configured to clear the contents when there is no match
        $response_body = '';
    }

}//end _trimContent()
659 
660 
/**
 * Rewrite the URLs found in the content, in place.
 *
 * Every matched URL is made absolute relative to the fetched URL. URLs in a
 * group whose tunnelling attribute is on (and which isUrlAllowed() accepts)
 * are additionally pointed back at this asset with the target encoded in
 * the SQ_REMOTE_SOURCE query variable.
 *
 * Groups, in processing order: scripts/stylesheets ('tunnel_scripts'),
 * embeds and flash movie params ('tunnel_media'), images ('tunnel_images'),
 * links/frames and forms ('load_links').
 *
 * Changes over the previous version:
 *  - the six copy-pasted match loops are one helper, _gatherUrlRewrites();
 *  - a <param> without a name attribute no longer triggers an undefined
 *    index on the name match (it is skipped, as before);
 *  - the form target hash is encoded once instead of twice.
 *
 * @param string &$content the content to rewrite
 *
 * @return void
 */
function _rewriteURLs(&$content)
{
    $replacement_url  = $this->getURL().'?'.SQ_REMOTE_SOURCE.'=';
    $parsed_fetch_url = parse_url($this->_tmp['url']);

    $subjects     = Array();
    $replacements = Array();

    // scripts and linked CSS or JavaScript, then CSS @import definitions
    $patterns = Array(
        '/(<(?:script|link)\s+.*?(?:src|href)\s*=\s*[\'"]*)\s*([^\'" >]+)([\'"> ]?[^<]*>)/i',
        '/(@import\s+url\s*\([\'"]?\s*)([^\'" ]+)([\'"]?\s*\))/i',
        '/(@import\s+[\'"]\s*)([^\'" ]+)([\'"])/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $this->attr('tunnel_scripts'), $replacement_url, Array('rewrite_urls'), $subjects, $replacements);

    // embedded objects
    $tunnel_media = $this->attr('tunnel_media');
    $patterns = Array(
        '/(<(?:embed)\s+.*?(?:src)\s*=\s*[\'"]*)\s*([^\'" >]+)([\'"> ]?[^<]*>)/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $tunnel_media, $replacement_url, Array(), $subjects, $replacements);

    // object parameters (flash); only the param named "movie" is rewritten
    $patterns = Array(
        '/(<(?:param)\s+.*?(?:value)\s*=\s*[\'"]*)\s*([^\'" >]+)([\'"> ]?[^<]*>)/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $tunnel_media, $replacement_url, Array(), $subjects, $replacements, 'movie_param');

    // images: <img>, background attributes, CSS backgrounds and list images, image inputs
    $patterns = Array(
        '/(<(?:img)\s+.*?(?:src)\s*=\s*[\'"]*)\s*([^\'" <>]+)([\'"> ]?[^<>]*>)/i',
        '/(background\s*=\s*[\'"]?)\s*([^\'"<> ]+)([\'" ]?[^<]*>)/i',
        '/((?:background.*?)url\s*\([\'"]?\s*)([^\'"; ]+)([\'"]?\s*\))/i',
        '/((?:list-style-image.*?)url\s*\([\'"]?\s*)([^\'"; ]+)([\'"]?\s*\))/i',
        '/(<(?:input)\s+.*?(?:src)\s*=\s*[\'"]+)\s*([^\'" >]+)([\'"> ]?[^<>]*>)/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $this->attr('tunnel_images'), $replacement_url, Array(), $subjects, $replacements);

    // links and frames except mailto and javascript and tel (URI Schemes),
    // plus one special case: onclick="location='...'" on inputs. We don't
    // generally look at JavaScript; this one was added for compatibility
    // with MySource v2 ecom module
    $load_links = $this->attr('load_links');
    $patterns = Array(
        '/(<(?:(?:a(?:rea)?[^>]*?href)|(?:i?frame[^>]*?src))\s*=\s*[\'"]?)\s*((?:(?![\#]|mailto|javascript|tel))[^\'"> #]+)([\'"> #]?[^<]*>)/i',
        '/(<input[^>]*?onclick\s*=\s*[\'"]location=\')\s*([^\'"> ]+)([\'"> ]?[^<]*>)/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $load_links, $replacement_url, Array('all'), $subjects, $replacements);

    // forms. NOTE: this captures even those actions that are empty
    $patterns = Array(
        '/(<(?:form[^>]*?action)\s*=\s*[\'\"]?)\s*([^\'\"> ]*)([\'\"> ]?[^<]*>)/i',
    );
    $this->_gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $load_links, $replacement_url, Array('all'), $subjects, $replacements, 'form');

    // finally, we replace all the urls with the re-written ones.
    $content = str_replace($subjects, $replacements, $content);

}//end _rewriteURLs()


/**
 * Collect subject/replacement pairs for one group of URL patterns.
 *
 * Each pattern must capture (1) the text before the URL, (2) the URL and
 * (3) the text after it. Matches whose URL _makeFullURL() rejects are skipped.
 *
 * @param string  $content          the content to scan (not modified)
 * @param array   $patterns         regular expressions with the three groups above
 * @param array   $parsed_fetch_url parse_url() result of the fetched URL
 * @param boolean $tunnel           whether allowed URLs are pointed back at this asset
 * @param string  $replacement_url  prefix put in front of the encoded target when tunnelling
 * @param array   $hash_options     transformation names handed to _encodeHash()
 * @param array   &$subjects        receives the original matched text
 * @param array   &$replacements    receives the rewritten text, index-aligned with $subjects
 * @param string  $mode             '' for plain matches; 'movie_param' to keep only
 *                                  matches carrying name=movie; 'form' to append the
 *                                  hidden SQ_REMOTE_SOURCE input after a tunnelled tag
 *
 * @return void
 */
function _gatherUrlRewrites($content, $patterns, $parsed_fetch_url, $tunnel, $replacement_url, $hash_options, &$subjects, &$replacements, $mode='')
{
    // find whatever matches the given patterns, keeping pattern order
    $matches = Array();
    foreach ($patterns as $pattern) {
        preg_match_all($pattern, $content, $matches_local, PREG_SET_ORDER);
        $matches = array_merge($matches, $matches_local);
    }

    foreach ($matches as $data) {
        $subject = $data[0];
        $prefix  = $data[1];
        $url     = $data[2];
        $postfix = $data[3];

        if ($mode === 'movie_param') {
            // only look at the parameter whose name is movie (for flash)
            $name_pattern = '/name\s*=\s*[\'"]*\s*([^\'" >]+)[\'"> ]+/i';
            if (!preg_match($name_pattern, $subject, $name_matches) || $name_matches[1] != 'movie') {
                continue;
            }
        }

        // make full url, ignore invalid urls
        if (!$this->_makeFullURL($parsed_fetch_url, $url)) {
            continue;
        }

        // if this group needs to be tunneled, prepare a new source url
        if ($tunnel && $this->isUrlAllowed($url)) {
            $encoded_url = $this->_encodeHash($url, $hash_options);
            if ($mode === 'form') {
                // forms require the parameters to be set to the hidden fields for proper functioning
                $postfix .= '<input type="hidden" name="'.SQ_REMOTE_SOURCE.'" value="'.$encoded_url.'" />';
            }
            $url = $replacement_url.$encoded_url;
        }

        $subjects[]     = $subject;
        $replacements[] = $prefix.$url.$postfix;
    }

}//end _gatherUrlRewrites()
938 
939 
/**
 * Remove images from the content, in place.
 *
 * <img> tags become their bracketed ALT text, HTML background attributes
 * are dropped, url(...) is removed from CSS background declarations, and
 * image inputs become regular submit buttons.
 *
 * Fixes over the previous version:
 *  - create_function() (removed in PHP 8) and the static calls to
 *    non-static methods it made are replaced by ordinary method callbacks;
 *  - the CSS background replacement was '\$1', which preg_replace() treats
 *    as an escaped dollar and emits as the literal text "$1"; it is now a
 *    real reference to the first group, so the declaration is kept and only
 *    url(...) is removed.
 *
 * @param string &$content the content to strip
 *
 * @return void
 */
function _stripAllImages(&$content)
{
    // replace HTML images with their ALT tags
    $pattern = '/(<(?:img)[^>]*?>)/i';
    $content = preg_replace_callback($pattern, array($this, '_imageStripMatch'), $content);

    // remove HTML background images
    $pattern = '/(background\s*=\s*[\'\"]?\s*[^\'\" <>]+[\'\"]?)/i';
    $content = preg_replace($pattern, '', $content);

    // strip CSS backgrounds: keep the text before url(...), drop the url(...)
    $pattern = '/(background.*?)(url\(.*?\))/i';
    $content = preg_replace($pattern, '$1', $content);

    // change Form Image buttons with regular buttons
    $pattern = '/(<(?:input)[^>]*type=["\' ]image["\' ][^>]*?>)/i';
    $content = preg_replace_callback($pattern, array($this, '_imageButtonReplaceMatch'), $content);

}//end _stripAllImages()


/**
 * preg_replace_callback() adapter: hands the whole matched <img> tag to _imageStrip().
 *
 * @param array $matches match array supplied by preg_replace_callback()
 *
 * @return string
 */
function _imageStripMatch($matches)
{
    return $this->_imageStrip($matches[0]);

}//end _imageStripMatch()


/**
 * preg_replace_callback() adapter: hands the whole matched image input to _imageButtonReplace().
 *
 * @param array $matches match array supplied by preg_replace_callback()
 *
 * @return string
 */
function _imageButtonReplaceMatch($matches)
{
    return $this->_imageButtonReplace($matches[0]);

}//end _imageButtonReplaceMatch()
983 
984 
/**
 * Apply the configured tag replacements to the content, in place.
 *
 * For every entry of the 'tag_replacements' attribute the opening tag is
 * replaced with the entry's 'start_tag' and the closing tag with its
 * 'end_tag'. When 'remove_content' is set, the text following the opening
 * tag (up to the next '<') is swallowed together with the tag.
 *
 * @param string &$content the content to process
 *
 * @return void
 */
function _doTagReplacements(&$content)
{
    $rules = $this->attr('tag_replacements');
    if (empty($rules)) {
        return;
    }

    foreach ($rules as $tag => $rule) {
        // opening tag, optionally with the text that follows it
        $trailing_text = $rule['remove_content'] ? '[^<]*' : '';
        $open_pattern  = '/(<(?:'.$tag.')(?:\s+[^>]*?)?>'.$trailing_text.')/i';
        $content = preg_replace($open_pattern, $rule['start_tag'], $content);

        // closing tag
        $close_pattern = '/(<(?:\/'.$tag.')[^>]*?>)/i';
        $content = preg_replace($close_pattern, $rule['end_tag'], $content);
    }

}//end _doTagReplacements()
1010 
1011 
/**
 * Turn an image input tag into a submit button tag.
 *
 * The type attribute is replaced with type="Submit" plus a value attribute.
 * The value is the tag's ALT text with its first letter capitalised, or
 * the default name when the tag has no ALT text.
 *
 * @param string $input_string        the complete <input ...> tag
 * @param string $default_button_name label used when no ALT text is found
 *
 * @return string
 */
function _imageButtonReplace($input_string, $default_button_name='Submit Button')
{
    $button_name = $default_button_name;
    if (preg_match('/(?:alt\s*=[\'" ]([^\'"><]+)[\'" ])/i', $input_string, $alt)) {
        $button_name = ucfirst($alt[1]);
    }

    return preg_replace(
        '/(type\s*=[\'" ][^\'"><]+[\'" ])/i',
        ' type="Submit" value="'.$button_name.'" ',
        $input_string
    );

}//end _imageButtonReplace()
1039 
1040 
/**
 * Produce the text that stands in for an image tag.
 *
 * The result is the tag's ALT text with its first letter capitalised, or
 * the default name when there is no ALT text, wrapped in square brackets.
 *
 * @param string $image_string       the complete <img ...> tag
 * @param string $default_image_name name used when no ALT text is found
 *
 * @return string
 */
function _imageStrip($image_string, $default_image_name='Image')
{
    $found_alt = preg_match('/(?:alt\s*=[\'" ]([^\'"><]+)[\'" ])/i', $image_string, $alt);
    $label = $found_alt ? ucfirst($alt[1]) : $default_image_name;

    return '['.$label.']';

}//end _imageStrip()
1065 
1066 
/**
 * Turn a URL found in the fetched page into an absolute URL, in place.
 *
 * Handles, in order: protocol-relative URLs ('//host/...', given the scheme
 * of the fetched URL), URLs that are already absolute (left as they are),
 * host-relative URLs ('/path') and URLs relative to the fetched page's path.
 *
 * Fix over the previous version: the first character is read with the
 * $url[0] offset syntax; the $url{0} form is a parse error on PHP 8.
 *
 * @param array  $parsed_fetch_url parse_url() result of the fetched URL
 * @param string &$url             the URL to complete; rewritten on success
 *
 * @return boolean FALSE when the URL is unset or ends in '://', TRUE otherwise
 */
function _makeFullURL($parsed_fetch_url, &$url)
{
    // reject invalid urls
    if (!isset($url) || (substr($url, -3) == '://')) {
        return FALSE;
    }

    // protocol-relative: borrow the scheme of the fetched URL
    if (substr($url, 0, 2) == '//') {
        $url = $parsed_fetch_url['scheme'].':'.$url;
    }

    // do nothing for absolute urls
    if (preg_match('/^[a-z0-9]+:\/\//i', $url)) {
        return TRUE;
    }

    // start building full URL
    $full_url = $parsed_fetch_url['scheme'].'://'.$parsed_fetch_url['host'];
    if (isset($parsed_fetch_url['port'])) {
        $full_url .= ':'.$parsed_fetch_url['port'];
    }

    if (strlen($url) && $url[0] == '/') {
        // host-relative: the URL is the whole path
        $final_path = $url;
    } else {
        $path = array_get_index($parsed_fetch_url, 'path', '/');
        // a fetched path that names a file contributes only its directory,
        // unless the URL is just a query string for that same file
        if (substr($path, -1) != '/' && !preg_match('/^\?/', $url)) {
            $path = dirname($path);
            if (strlen($path) > 1) $path .= '/';
        }

        $final_path = $path.$url;
    }

    $full_url .= $this->_makeProperPath($final_path);

    $url = $full_url;

    return TRUE;

}//end _makeFullURL()
1120 
1121 
/**
 * Normalise a slash-separated path
 *
 * "." and empty segments are dropped and each ".." removes the segment
 * before it (a ".." with nothing left to remove is ignored). A leading
 * slash is kept; a trailing slash is not.
 *
 * Examples: "/a/b/../c/./d" => "/a/c/d", "a//b/" => "a/b", "/../a" => "/a"
 *
 * @param string $path the path to normalise
 *
 * @return string the normalised path, or '' for an empty path
 */
function _makeProperPath($path='')
{
    // Strict comparison rather than empty(): a path (or first segment) of
    // "0" is valid and must not be mistaken for "nothing"
    $path = (string) $path;
    if ($path === '') {
        return '';
    }

    $root            = '';
    $path_components = explode('/', $path);

    // an empty first segment means the path started with a slash
    if ($path_components[0] === '') {
        $root = '/';
        unset($path_components[0]);
    }

    $stack = Array();

    foreach ($path_components as $component) {
        if ($component === '..') {
            // step up one level, but never above the top of the path
            if (!empty($stack)) array_pop($stack);
        } else if ($component === '.' || $component === '') {
            // current-directory marker or doubled/trailing slash: skip
        } else {
            array_push($stack, $component);
        }
    }

    return $root.implode('/', $stack);

}//end _makeProperPath()
1174 
1175 
1176 //-- GENERAL HELPERS --//
1177 
1178 
/**
 * Pack a URL and a set of transformation flags into a single hash string
 *
 * The URL is stored under the key 'url' and every entry of $transformations
 * becomes a key with the value 1. The resulting array is serialised with
 * make_raw_post_data(), base64-encoded, prefixed with "+" and url-encoded.
 * An empty URL raises warning CMS0065 and yields an empty string.
 *
 * @param string $url             the URL to encode
 * @param array  $transformations names of the options to flag in the hash
 *
 * @return string
 */
function _encodeHash($url, $transformations=Array())
{
    if (empty($url)) {
        trigger_localised_error('CMS0065', E_USER_WARNING);
        return '';
    }

    // Very conservative fix for '&amp;': entities are only decoded when
    // the URL actually contains one
    $has_encoded_amp = (strpos($url, '&amp;') !== FALSE);
    $components      = Array(
                        'url' => ($has_encoded_amp ? html_entity_decode($url) : $url),
                       );

    foreach ($transformations as $option_name) {
        $components[$option_name] = 1;
    }

    $raw_data = make_raw_post_data($components);

    return urlencode('+'.base64_encode($raw_data));

}//end _encodeHash()
1210 
1211 
/**
 * Unpack a hash string into its URL and transformation flags
 *
 * The hash is url-decoded if it does not already start with "+", the
 * leading character is stripped, and the remainder is base64-decoded and
 * parsed as a query string. Warning CMS0065 is raised when the result has
 * no 'url' entry. Every option listed in $this->_transformation_options is
 * then set to a boolean: TRUE when the option itself or 'all' was present
 * and non-empty in the hash, FALSE otherwise.
 *
 * @param string $hash the hash to decode
 *
 * @return array the decoded components, including one boolean per transformation option
 */
function _decodeHash($hash)
{
    $hash = (string) $hash;

    // This is a micro hack to detect if PHP has prematurely decoded the url.
    // substr() is used instead of $hash{0}: the curly-brace offset is a parse
    // error from PHP 8.0 and raised a notice on an empty hash before that.
    if (substr($hash, 0, 1) !== '+') $hash = urldecode($hash);

    // the cast covers substr() returning FALSE for a too-short hash on PHP < 8
    $hash       = base64_decode((string) substr($hash, 1));
    $components = Array();
    parse_str(html_entity_decode($hash), $components);
    if (empty($components['url'])) {
        trigger_localised_error('CMS0065', E_USER_WARNING);
    }

    // insert any missing options (presumed false) and expand the 'all' option
    foreach ($this->_transformation_options as $option) {
        $components[$option] = !empty($components[$option]) || !empty($components['all']);
    }

    return $components;

}//end _decodeHash()
1238 
1239 
/**
 * Check whether a URL may be fetched by this asset
 *
 * A URL is allowed when it equals the 'remote_url' attribute, when the
 * 'all' rewriting option is on, or when its lower-cased host+path starts
 * with an entry of the white list. The white list is built once per
 * request (cached in $this->_tmp['_white_list_']) from the
 * 'url_rewriting_options' attribute:
 *   - 'remote_domain' adds the host of the remote URL
 *   - 'remote_url'    adds host+path of the remote URL
 *   - 'white_list'    adds the entries of the 'url_white_list' attribute
 *
 * Matching rules:
 *   - blank entries are ignored (a zero-length prefix would match everything)
 *   - an entry without a "/" is a host name and must match the whole host,
 *     so "example.com" does not allow "example.com.evil.org"
 *   - an entry with a path is matched as a plain prefix of host+path
 *
 * @param string $url the URL to check
 *
 * @return boolean
 */
function isUrlAllowed($url)
{
    if ($url == $this->attr('remote_url')) return TRUE;

    $options = $this->attr('url_rewriting_options');
    // empty() rather than a bare index so a missing option reads as "off"
    if (!empty($options['all'])) return TRUE;

    if (!isset($this->_tmp['_white_list_'])) {
        // parse_url() returns FALSE on a malformed URL and omits absent parts
        $remote_parts = parse_url($this->attr('remote_url'));
        if (!is_array($remote_parts)) $remote_parts = Array();
        $host = isset($remote_parts['host']) ? $remote_parts['host'] : '';
        $path = isset($remote_parts['path']) ? $remote_parts['path'] : '';

        $white_list = Array();
        if (!empty($options['remote_domain'])) {
            $white_list[] = strtolower($host);
        }
        if (!empty($options['remote_url'])) {
            $white_list[] = strtolower($host.$path);
        }
        if (!empty($options['white_list'])) {
            $white_list = array_merge($this->attr('url_white_list'), $white_list);
        }
        $this->_tmp['_white_list_'] = $white_list;
    }

    if (empty($this->_tmp['_white_list_'])) return FALSE;

    $url_parts = parse_url($url);
    if (!is_array($url_parts)) return FALSE;

    // compare on lower-cased host+path only (scheme, port and query are ignored)
    $url = isset($url_parts['host']) ? $url_parts['host'] : '';
    if (isset($url_parts['path'])) {
        $url .= $url_parts['path'];
    }
    $url     = strtolower($url);
    $url_len = strlen($url);

    foreach ($this->_tmp['_white_list_'] as $white_url) {
        $white_url     = strtolower(trim($white_url));
        $white_url_len = strlen($white_url);

        // a blank entry would be a prefix of every URL
        if ($white_url_len === 0 || $white_url_len > $url_len) {
            continue;
        }
        if (substr($url, 0, $white_url_len) !== $white_url) {
            continue;
        }

        // Host-only entry: the match has to end where the host ends,
        // otherwise "example.com" would also allow "example.com.evil.org"
        if (strpos($white_url, '/') === FALSE
            && $url_len > $white_url_len
            && $url[$white_url_len] !== '/') {
            continue;
        }

        return TRUE;
    }

    return FALSE;

}//end isUrlAllowed()
1296 
1297 
1298 }//end class
1299 
1300 ?>