Squiz Matrix  4.12.2
 All Data Structures Namespaces Functions Variables Pages
html_tidy.inc
1 <?php
18 global $ROOT_PATH;
19 include_once($ROOT_PATH.'wysiwyg_plugin.inc');
20 require_once SQ_DATA_PATH.'/private/conf/tools.inc';
21 
22 
class HTML_Tidy extends Wysiwyg_Plugin
{

    /**
    * Names of additional inline tags registered through new_inline_tag()
    *
    * They are handed to HTML Tidy via --new-inline-tags so that it does not
    * strip them, and warnings about them are filtered out of the error report.
    *
    * @var array
    */
    var $_new_inline_tags = Array();

    /**
    * Toolbar visibility flag
    *
    * Not read anywhere in this class; presumably consumed by the WYSIWYG
    * framework to keep this plugin out of the toolbar (confirm in Wysiwyg_Plugin).
    *
    * @var boolean
    */
    var $_show_in_toolbar = FALSE;

    /**
    * Outcome of the last process() call
    *
    * One of 'pass', 'fail', 'wait' (tidy not usable) or 'disabled' (binary not found).
    *
    * @var string
    */
    var $htmltidy_status = 'pass';

    /**
    * Filtered, trimmed stderr output of the last HTML Tidy run (NULL until tidy has run)
    *
    * @var string
    */
    var $htmltidy_errors = NULL;

    /**
    * HTML versions and their respective doctype declaration
    *
    * @var array
    */
    var $html_standards = Array(
        'HTML_4.01_Transitional' => '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">',
        'HTML_4.01_Strict'       => '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> ',
        'HTML_4.01_Frameset'     => '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">',
        'XHTML_1.0_Strict'       => '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        'XHTML_1.0_Transitional' => '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        'XHTML_1.0_Frameset'     => '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
        'XHTML_1.1'              => '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
    );


    /**
    * Constructor
    *
    * Deliberately left empty, exactly as before (the parent constructor is not invoked).
    *
    * @access public
    */
    function HTML_Tidy()
    {

    }//end constructor


    /**
    * Register an inline tag that HTML Tidy must not strip out
    *
    * Registering the same tag twice is harmless; it is stored only once.
    *
    * @param string $tag_name name of the tag, without angle brackets
    *
    * @return boolean always TRUE
    * @access public
    */
    function new_inline_tag($tag_name)
    {
        if (!in_array($tag_name, $this->_new_inline_tags, TRUE)) {
            $this->_new_inline_tags[] = $tag_name;
        }
        return TRUE;

    }//end new_inline_tag()


    /**
    * Clean up the supplied HTML and, when available, run it through HTML Tidy
    *
    * The content is modified in place. $this->htmltidy_status is set to:
    *  - 'disabled' when the tidy binary does not exist (content untouched)
    *  - 'wait'     when tidy exists but check_usable() does not return TRUE
    *               (character and MS Word clean-up is still applied)
    *  - 'fail'     when tidy reported fatal errors, produced no output or no
    *               tidy character set is known (tidy output is discarded)
    *  - 'pass'     otherwise
    * Empty content is skipped entirely.
    *
    * @param string &$html the HTML fragment to process
    *
    * @return void
    * @access public
    */
    function process(&$html)
    {
        $this->htmltidy_status = 'pass';

        if (!file_exists(SQ_TOOL_HTML_TIDY_PATH)) {
            // Disable HTML Tidy if path doesn't exist
            $this->htmltidy_status = 'disabled';
            return;
        }

        if (empty($html)) {
            // skip as we cannot do tidy on empty content
            return;
        }

        $html = $this->_replaceUnsupportedCharacters($html);
        $html = $this->_cleanMsWordMarkup($html);

        // Remove javascript tags
        $html = $this->_pregReplace('|<script\b[^>]*>(.*?)</script>|is', '', $html);

        if ($this->check_usable() !== TRUE) {
            $this->htmltidy_status = 'wait';
            return;
        }

        // character_sets.inc defines $standards_lists_tidy_char_sets in this scope
        require SQ_FUDGE_PATH.'/standards_lists/character_sets.inc';
        $tidy_char_set = array_get_index($standards_lists_tidy_char_sets, SQ_CONF_DEFAULT_CHARACTER_SET);

        $result = $this->_runTidy($this->_buildTidyArguments($tidy_char_set), $this->_wrapInDocument($html));

        // First ignore anything we don't want to show for whatever reason
        $error = $result['errors'];
        foreach ($this->getIgnoredErrors() as $ignored) {
            $error = str_replace($ignored, '', $error);
        }

        // Filter out errors caused by inline tags used by WYSIWYG
        // (tag names are quoted so they cannot alter the pattern)
        foreach ($this->_new_inline_tags as $tag) {
            $error = $this->_pregReplace('/line \d+ column \d+\ - Warning\: <'.preg_quote($tag, '/').'> is not approved by W3C/', '', $error);
        }

        $this->htmltidy_errors = trim($error);

        // A return of 2 from tidy indicates there were fatal errors.
        // (0 = no messages; 1 = only warnings, which we allow)
        if (($result['exit_code'] >= 2) || empty($result['output']) || empty($tidy_char_set)) {
            $this->htmltidy_status = 'fail';
            return;
        }

        // Only now we update the HTML
        $html = $result['output'];

        // HTML Tidy adds a new line character after <pre> and before </pre>,
        // which causes problems when the content is put back into the WYSIWYG,
        // so those new lines are removed again
        $html = $this->_pregReplace('|<pre>\n|is', '<pre>', $html);
        $html = $this->_pregReplace('|\n</pre>|is', '</pre>', $html);

    }//end process()


    /**
    * preg_replace() that never loses the subject
    *
    * preg_replace() returns NULL when PCRE fails (e.g. backtrack limit hit on
    * large content); assigning that straight back would wipe the content, so
    * the untouched subject is returned instead.
    *
    * @param string $pattern     the pattern to search for
    * @param string $replacement the replacement
    * @param string $subject     the string to work on
    *
    * @return string
    * @access private
    */
    private function _pregReplace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);
        return ($result === NULL) ? $subject : $result;

    }//end _pregReplace()


    /**
    * Turn characters tidy cannot cope with (mostly MS Word ones) into HTML entities
    *
    * @param string $html the content to convert
    *
    * @return string
    * @access private
    */
    private function _replaceUnsupportedCharacters($html)
    {
        // Replace the following unsupported character codes that do not get replaced
        // back to their original characters through the encoding process (mostly MS Word chars)
        $search  = Array('%u2018', '%u2019', '%u2013', '%u2014', '%u2026', '%u201C', '%u201D');
        $replace = Array('&lsquo;', '&rsquo;', '&ndash;', '&mdash;', '&hellip;', '&ldquo;', '&rdquo;');
        $html    = str_replace($search, $replace, $html);

        // If the page is on a character set other than UTF-8, change these common
        // MS Word (Windows-1252) characters, otherwise HTML Tidy throws errors.
        // The comparison is case-insensitive: on a UTF-8 site these byte values
        // are continuation bytes and replacing them would corrupt the content.
        if (strtolower(SQ_CONF_DEFAULT_CHARACTER_SET) != 'utf-8') {
            $word_chars = Array(
                chr(145) => '&lsquo;', // left single quote
                chr(146) => '&rsquo;', // right single quote
                chr(147) => '&ldquo;', // left double quote
                chr(148) => '&rdquo;', // right double quote
                chr(149) => '&bull;',  // bullet (was wrongly mapped to &ndash;)
                chr(150) => '&ndash;', // endash
                chr(151) => '&mdash;', // emdash
            );
            $html = str_replace(array_keys($word_chars), array_values($word_chars), $html);
        }

        // replace unicode's URL '%u' character entities with their HTML entity equivs
        return $this->_pregReplace('/%u([0-9A-Fa-f]{4})/', '&#x$1;', $html);

    }//end _replaceUnsupportedCharacters()


    /**
    * Remove the weird tags and attributes MS Word leaves in its output
    *
    * The rules are applied one after another, in the order listed.
    *
    * @param string $html the content to clean
    *
    * @return string
    * @access private
    */
    private function _cleanMsWordMarkup($html)
    {
        $rules = Array(
            // elements carrying an mso* class or style are reduced to the bare tag
            Array('|<([\w]+)([^>]+?)class="?mso[^">]+"?([^>]+)?>|is', '<\\1>'),
            Array('|<([\w]+)([^>]+?)style="([^"]+)?mso([^"]+)?"([^>]+)?>|is', '<\\1>'),
            // namespaced tags such as <o:p>
            Array('|<\/?\w+:[^>]*>|is', ''),
            // <font> and language <span> wrappers are dropped, their content kept
            Array('|<font[^>]+>(.+?)</font>|is', '\\1'),
            Array('|<span([^>]+?)lang=[^\s]+([^>]+?)xml:lang="[^\s]+">(.+?)</span>|is', '\\3'),
            Array('|<\?xml[^>]+microsoft[^>]+\?>|is', ''),
            Array('|<\/?\w+:[^>]*>|is', ''),
            // NOTE(review): [^>] has no quantifier, so this only matches "<?xml"
            // followed by exactly one character and ">"; possibly meant [^>]*.
            // Left unchanged until the intent is confirmed.
            Array('|<\\?\??xml[^>]>|is', ''),
        );

        foreach ($rules as $rule) {
            $html = $this->_pregReplace($rule[0], $rule[1], $html);
        }
        return $html;

    }//end _cleanMsWordMarkup()


    /**
    * Build the command line arguments passed to the tidy binary
    *
    * Every option value is shell-escaped. Boolean options are passed as 1/0.
    *
    * @param string $tidy_char_set tidy's name for the site character set (may be empty)
    *
    * @return string arguments, starting with a space
    * @access private
    */
    private function _buildTidyArguments($tidy_char_set)
    {
        $config = Array(
            '--preserve-entities' => TRUE,
            '--show-body-only'    => TRUE,
            '--show-errors'       => 1,
            '--show-warnings'     => TRUE,
            '--wrap'              => FALSE,
            '--word-2000'         => TRUE,
            '--force-output'      => TRUE,
            '--logical-emphasis'  => TRUE,
            '--char-encoding'     => $tidy_char_set,
            '-access'             => SQ_TOOL_HTML_TIDY_ACCESSIBILITY_LEVEL,
        );

        // An empty value would make tidy swallow the next option as the encoding;
        // process() reports 'fail' for an unknown character set regardless
        if (empty($tidy_char_set)) {
            unset($config['--char-encoding']);
        }

        // add any inline tags that other plugins may use (possibly for encapsulating
        // plugin specific data) so HTML Tidy doesn't strip them out
        if (!empty($this->_new_inline_tags)) {
            $config['--new-inline-tags'] = implode(',', $this->_new_inline_tags);
        }

        $args = ' -iq -asxhtml';
        foreach ($config as $param => $option) {
            $value = is_bool($option) ? (int) $option : $option;
            $args .= ' '.$param.' '.escapeshellarg((string) $value);
        }
        return $args;

    }//end _buildTidyArguments()


    /**
    * Wrap an HTML fragment in a minimal document using the configured doctype
    *
    * Falls back to HTML 4.01 Transitional when no standard is configured or the
    * configured one is not listed in $this->html_standards.
    *
    * @param string $html the fragment to wrap
    *
    * @return string
    * @access private
    */
    private function _wrapInDocument($html)
    {
        $standard = SQ_TOOL_HTML_TIDY_HTML_STANDARD ? SQ_TOOL_HTML_TIDY_HTML_STANDARD : 'HTML_4.01_Transitional';
        if (!isset($this->html_standards[$standard])) {
            $standard = 'HTML_4.01_Transitional';
        }

        return implode("\n", Array(
            $this->html_standards[$standard],
            '<html lang="en"><head><title></title>',
            '<meta name="description" content="test content" />',
            '</head><body>',
            $html,
            '</body></html>',
        ));

    }//end _wrapInDocument()


    /**
    * Run the tidy binary over a document, exchanging data through temporary files
    *
    * The temporary files are always removed again. When tidy could not be run,
    * 'exit_code' is NULL and 'output' and 'errors' are empty strings.
    *
    * @param string $args     command line arguments (see _buildTidyArguments())
    * @param string $document the complete document to feed to tidy
    *
    * @return array with keys 'exit_code', 'output' and 'errors'
    * @access private
    */
    private function _runTidy($args, $document)
    {
        $result = Array('exit_code' => NULL, 'output' => '', 'errors' => '');

        // tempnam() creates the files itself and returns FALSE on failure
        $input_file  = tempnam(SQ_TEMP_PATH, 'tidy_in_');
        $output_file = tempnam(SQ_TEMP_PATH, 'tidy_out_');
        $error_file  = tempnam(SQ_TEMP_PATH, 'tidy_err_');
        $temp_files  = array_filter(Array($input_file, $output_file, $error_file));

        if ((count($temp_files) === 3) && (file_put_contents($input_file, $document) !== FALSE)) {
            $descriptorspec = Array(
                0 => Array('file', $input_file, 'r'),  // file that the child will read from
                1 => Array('file', $output_file, 'w'), // file that the child will write to
                2 => Array('file', $error_file, 'w'),  // stderr is a file to write to
            );

            $pipes   = Array();
            $process = proc_open(escapeshellarg(SQ_TOOL_HTML_TIDY_PATH).$args, $descriptorspec, $pipes);

            if (is_resource($process)) {
                // proc_close() waits for tidy to finish, so the files are complete afterwards
                $result['exit_code'] = proc_close($process);
                $result['output']    = (string) file_get_contents($output_file);
                $result['errors']    = (string) file_get_contents($error_file);
            }
        }

        foreach ($temp_files as $temp_file) {
            unlink($temp_file);
        }
        return $result;

    }//end _runTidy()


    /**
    * Check whether HTML Tidy is enabled and its binary exists
    *
    * @return mixed TRUE when usable, otherwise the translated "disabled" message
    * @access public
    */
    public static function check_usable()
    {
        if (SQ_TOOL_HTML_TIDY_ENABLED && file_exists(SQ_TOOL_HTML_TIDY_PATH)) {
            return TRUE;
        }
        return translate('wysiwyg_htmltidy_status_disabled');

    }//end check_usable()


    /**
    * Get the tidy messages that are removed from the error report
    *
    * @return array list of exact strings to strip from tidy's error output
    * @access public
    */
    function getIgnoredErrors()
    {
        return Array(
            'line 1 column 1 - Access: [3.3.1.1]: use style sheets to control presentation.',
            "\nAccessibility Checks: Version 0.1\n",
        );

    }//end getIgnoredErrors()


}//end class
318 
319 ?>