This file is part of TexieR - universal text to html converter.
rane <rane@metatribe.org>
Original version:
Copyright (c) 2004-2006 David Grudl
Ruby port:
Copyright (c) 2006 rane
Texier is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation.
Texier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
0.1 ($Revision: 25 $ $Date: 2006-10-24 21:52:06 +0200 (Ut, 24 okt 2006) $)
| HASH | = | "\x15-\x1F" | hashing meta-characters | |
| HASH_SPACES | = | "\x15-\x18" | ||
| HASH_NC | = | "\x19\x1B-\x1F" | ||
| HASH_WC | = | "\x1A-\x1F" | ||
| PATTERN_LINK_REF | = | /\[[^\[\]\*\n#{HASH}]+?\]/ | links | |
| PATTERN_LINK_IMAGE | = | /\[\*[^\n#{HASH}]+\*\]/ | ||
| PATTERN_LINK_URL | = | /(?:\[[^\]\n]+?\]|(?!\[)[^\s#{HASH}]*[^:\);,\.!\?\s#{HASH}])/ | ||
| PATTERN_LINK | = | /(?::(#{PATTERN_LINK_URL}))/ | ||
| PATTERN_LINK_N | = | /(?::(#{PATTERN_LINK_URL}|:))/ | ||
| PATTERN_EMAIL | = | /[a-z0-9.+_-]+@[a-z0-9.+_-]{2,}\.[a-z]{2,}/ | ||
| PATTERN_MODIFIER | = | /(?:\ *(?:\ \.|^\.)(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\})(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\})??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\})??)/ | modifier .(title)[class]{style} | |
| PATTERN_MODIFIER_H | = | /(?:\ *(?:\ \.|^\.)(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<))(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<))??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<))??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<))??)/ | modifier .(title)[class]{style}<> | |
| PATTERN_MODIFIER_HV | = | /(?:\ *(?:\ \.|^\.)(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<)|(?:\^|\-|\_))(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<)|(?:\^|\-|\_))??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<)|(?:\^|\-|\_))??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<)|(?:\^|\-|\_))??(\([^\n\)]+\)|\[[^\n\]]+\]|\{[^\n\}]+\}|(?:<>|>|=|<)|(?:\^|\-|\_))??)/ | modifier .(title)[class]{style}<>^ | |
| PATTERN_IMAGE | = | /\[\*([^\n#{HASH}]+?)#{PATTERN_MODIFIER}?\ *(\*|>|<)\]/ | images [* urls .(title)[class]{style} >] |
| allowed_classes | [RW] | Allowed classes |
| allowed_styles | [RW] | Allowed inline CSS styles |
| allowed_tags | [RW] | Allowed HTML tags |
| block_module | [R] | Default modules |
| block_patterns | [R] |
Registered regexps and associated handlers for block parsing.
Format:
{:handler => proc,
:pattern => regular expression}
|
| definition_list_module | [R] | Default modules |
| dom | [R] | DOM structure for parsed text |
| formatter_module | [R] | Default modules |
| generic_block_module | [R] | Default modules |
| heading_module | [R] | Default modules |
| horiz_line_module | [R] | Default modules |
| html_module | [R] | Default modules |
| image_desc_module | [R] | Default modules |
| image_module | [R] | Default modules |
| line_patterns | [R] |
Registered regexps and associated handlers for inline parsing.
Format:
{:handler => proc,
:pattern => regular expression}
|
| link_module | [R] | Default modules |
| list_module | [R] | Default modules |
| merge_lines | [RW] | Merge lines mode |
| modules | [R] | List of all used modules |
| obfuscate_email | [W] | Obfuscate e-mail addresses? |
| phrase_module | [R] | Default modules |
| quick_correct_module | [R] | Default modules |
| quote_module | [R] | Default modules |
| reference_handler | [RW] | Optional callable invoked as call(name, texy) when a named reference is not found |
| smilies_module | [R] | Default modules |
| summary | [RW] | Parsing summary |
| tab_width | [RW] | TAB width (for converting tabs to spaces) |
| table_module | [R] | Default modules |
Translate all white spaces (\t \n \r space) to meta-spaces \x15-\x18 which are ignored by some formatting functions
# File lib/texy.rb, line 288
# Replace each whitespace character with its meta-space counterpart:
# space -> \x15, tab -> \x16, carriage return -> \x17, newline -> \x18.
# Returns a new string; the argument is not modified.
def self.freeze_spaces(string)
  whitespace = " \t\r\n"
  meta_spaces = "\x15\x16\x17\x18"
  string.tr(whitespace, meta_spaces)
end
# File lib/texy.rb, line 104
# Set the default configuration, create empty pattern and reference
# registries, then instantiate the default modules via load_modules.
def initialize
  # Default configuration.
  self.tab_width       = 8
  self.allowed_classes = :all
  self.allowed_styles  = :all
  self.allowed_tags    = Texy::Html::VALID # full support for HTML tags
  self.obfuscate_email = true
  self.summary = {
    :images  => [],
    :links   => [],
    :preload => []
  }
  self.merge_lines = true

  # Empty registries.
  @line_patterns  = []
  @block_patterns = []
  @references     = {}

  # load all modules
  load_modules
end
Revert meta-spaces back to normal spaces
# File lib/texy.rb, line 293
# Inverse of freeze_spaces: map the meta-spaces back to real whitespace
# (\x15 -> space, \x16 -> tab, \x17 -> carriage return, \x18 -> newline).
# Returns a new string; the argument is not modified.
def self.unfreeze_spaces(string)
  meta_spaces = "\x15\x16\x17\x18"
  whitespace = " \t\r\n"
  string.tr(meta_spaces, whitespace)
end
Add new named reference
# File lib/texy.rb, line 309
# Register +obj+ as a named reference. The name is stored lower-cased so
# that lookups are case-insensitive.
#
# name - String name of the reference (not modified).
# obj  - the object to store under that name.
#
# Returns +obj+.
def add_reference(name, obj)
  # Use the non-destructive downcase: the previous downcase! altered the
  # caller's string in place and raised on frozen strings.
  key = name.downcase # watch out for utf8!
  @references[key] = obj
end
Retrieve a named reference. If it does not exist, try calling the user-supplied handler to create one.
# File lib/texy.rb, line 315
# Look up a named reference, case-insensitively.
#
# Returns the stored object when one is registered under the lower-cased
# name; otherwise the result of reference_handler.call(name, self) when a
# handler is set; otherwise false.
def reference(name)
  key = name.downcase # watch out for UTF8 !
  stored = @references[key]

  if stored
    stored
  elsif reference_handler
    # The handler receives the name as given, not the lower-cased key.
    reference_handler.call(name, self)
  else
    false
  end
end
# File lib/texy.rb, line 234
# Append a handler/pattern pair to the block-pattern registry.
#
# handler - stored under :handler.
# pattern - stored under :pattern.
#
# Returns the @block_patterns array.
def register_block_pattern(handler, pattern)
  # raise ArgumentError, 'Not a block pattern: ' + pattern.source unless /(.)\^.*\$\\1[a-z]*/i =~ pattern
  entry = { :handler => handler, :pattern => pattern }
  @block_patterns.push(entry)
end
# File lib/texy.rb, line 220
# Append a handler/pattern pair to the line-pattern registry.
#
# handler - stored under :handler.
# pattern - stored under :pattern.
#
# Returns the @line_patterns array.
def register_line_pattern(handler, pattern)
  entry = { :handler => handler, :pattern => pattern }
  @line_patterns.push(entry)
end
Switch Texy and default modules to safe mode
Suitable for comments and other uses where an attacker may supply the input text.
# File lib/texy.rb, line 265
# Restrict the converter for untrusted input: turn off class/ID and style
# modifiers, put the HTML module into its safe mode, disable images and
# force rel="nofollow" on links.
def safe_mode
  # no class or ID are allowed
  self.allowed_classes = false
  # style modifiers are disabled
  self.allowed_styles = false

  # delegate HTML tag/attribute restrictions to the HTML module
  html_module.safe_mode
  # disable images
  image_module.allowed = false
  # force rel="nofollow"
  link_module.force_no_follow = true
end
Switch Texy and default modules to (default) trust mode
# File lib/texy.rb, line 276
# Switch back to the permissive configuration: allow all classes/IDs and
# inline styles, put the HTML module into its trust mode, enable images
# and stop forcing rel="nofollow" on links.
def trust_mode
  self.allowed_classes = :all         # classes and id are allowed
  self.allowed_styles = :all          # inline styles are allowed
  html_module.trust_mode              # full support for HTML tags
  image_module.allowed = true         # enable images
  # Must be false here: safe_mode sets this flag to true to force
  # rel="nofollow", so leaving it true would never turn that off again.
  link_module.force_no_follow = false # disable automatic rel="nofollow"
end
Initialization
It is called between constructor and first use (method parse).
# File lib/texy.rb, line 248
# Reset the cache and both pattern registries, then call init on every
# module.
#
# Returns the array of the modules' init results.
# Raises RuntimeError when no modules are installed.
def init
  @cache = []
  @line_patterns = []
  @block_patterns = []

  if modules.empty?
    raise RuntimeError, 'Texy: No modules installed'
  end

  # init modules
  modules.map { |mod| mod.init }
end
Create array of all used modules.
This array can be changed by overriding this method (by subclasses) or directly in main code.
# File lib/texy.rb, line 184
# Instantiate every default module, passing this object to each
# constructor, and keep each one in its own instance variable.
#
# Returns the last module created (the formatter module).
def load_modules
  # Line parsing - order does not matter
  @script_module  = Modules::Script.new(self)
  @html_module    = Modules::Html.new(self)
  @image_module   = Modules::Image.new(self)
  @link_module    = Modules::Link.new(self)
  @phrase_module  = Modules::Phrase.new(self)
  @smilies_module = Modules::Smilies.new(self)

  # Block parsing - order does not matter
  @block_module           = Modules::Block.new(self)
  @heading_module         = Modules::Heading.new(self)
  @horiz_line_module      = Modules::HorizLine.new(self)
  @quote_module           = Modules::Quote.new(self)
  @list_module            = Modules::List.new(self)
  @definition_list_module = Modules::DefinitionList.new(self)
  @table_module           = Modules::Table.new(self)
  @image_desc_module      = Modules::ImageDesc.new(self)
  @generic_block_module   = Modules::GenericBlock.new(self)

  # post process
  @quick_correct_module = Modules::QuickCorrect.new(self)
  # @long_words_module = Modules::LongWords.new(self)
  @formatter_module     = Modules::Formatter.new(self)
end