class Sanitize

Constants

REGEX_DATA_ATTR

Matches a valid HTML5 data attribute name. The unicode ranges included here are a conservative subset of the full range of characters that are technically allowed, with the intent of matching the most common characters used in data attribute names while excluding uncommon or potentially misleading characters, or characters with the potential to be normalized into unsafe or confusing forms.

If you need data attr names with characters that aren't included here (such as combining marks, full-width characters, or CJK), please consider creating a custom transformer to validate attributes according to your needs.

www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes

REGEX_PROTOCOL

Matches an attribute value that could be treated by a browser as a URL with a protocol prefix, such as “http:” or “javascript:”. Any string of zero or more characters followed by a colon is considered a match, even if the colon is encoded as an entity and even if it's an incomplete entity (which IE6 and Opera will still parse).

VERSION

Attributes

config[R]

Public Class Methods

clean(html, config = {}) click to toggle source

Returns a sanitized copy of html, using the settings in config if specified.

# File lib/sanitize.rb, line 65
# Convenience wrapper: builds a one-off Sanitize instance from +config+ and
# returns whatever its #clean method returns for +html+.
def self.clean(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean(html)
end
clean!(html, config = {}) click to toggle source

Performs #clean in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 71
# Convenience wrapper: builds a one-off Sanitize instance from +config+ and
# returns whatever its #clean! method returns for +html+.
def self.clean!(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean!(html)
end
clean_document(html, config = {}) click to toggle source

Performs a #clean using a full-document HTML parser instead of the default fragment parser. This will add a DOCTYPE and html tag unless they are already present.

# File lib/sanitize.rb, line 78
# Convenience wrapper: builds a one-off Sanitize instance from +config+ and
# returns whatever its #clean_document method returns for +html+.
def self.clean_document(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_document(html)
end
clean_document!(html, config = {}) click to toggle source

Performs #clean_document in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 84
# Convenience wrapper: builds a one-off Sanitize instance from +config+ and
# returns whatever its #clean_document! method returns for +html+.
def self.clean_document!(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_document!(html)
end
clean_node!(node, config = {}) click to toggle source

Sanitizes the specified Nokogiri::XML::Node and all its children.

# File lib/sanitize.rb, line 89
# Convenience wrapper: builds a one-off Sanitize instance from +config+ and
# returns whatever its #clean_node! method returns for +node+.
def self.clean_node!(node, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_node!(node)
end
new(config = {}) click to toggle source

Returns a new Sanitize object initialized with the settings in config.

# File lib/sanitize.rb, line 98
# Builds a sanitizer whose settings are Config::DEFAULT overlaid with +config+.
#
# Two transformer chains are assembled and stored in @transformers:
#   :breadth - the :transformers_breadth setting
#   :depth   - the :transformers setting followed by :transformers_depth,
#              then the built-in transformers appended below
#
# Each setting may be nil, a single transformer, or an array of them; Array()
# normalizes all three shapes.
def initialize(config = {})
  @config = Config::DEFAULT.merge(config)

  @transformers = {
    # Coerce first, then copy: Array() hands back the very same object when
    # given an array, so the dup keeps our chain separate from the config's,
    # and coercing before dup avoids calling dup on a non-dupable value
    # (e.g. nil on older Rubies).
    :breadth => Array(@config[:transformers_breadth]).dup,
    # Array#+ already produces a fresh array, so no explicit copy is needed.
    :depth   => Array(@config[:transformers]) + Array(@config[:transformers_depth])
  }

  # Default depth transformers. These always run at the end of the chain,
  # after any custom transformers.
  @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
  @transformers[:depth] << Transformers::CleanCDATA
  @transformers[:depth] << Transformers::CleanElement.new(@config)
end

Public Instance Methods

clean(html) click to toggle source

Returns a sanitized copy of the given html fragment.

# File lib/sanitize.rb, line 116
# Sanitizes a duplicate of +html+, leaving the argument itself untouched.
# Returns nil when +html+ is nil or false; otherwise returns the result of
# #clean! on the duplicate, or the duplicate itself when #clean! reports no
# changes (nil).
def clean(html)
  return unless html

  copy = html.dup
  clean!(copy) || copy
end
clean!(html, parser = Nokogiri::HTML::DocumentFragment) click to toggle source

Performs clean in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 125
# Sanitizes +html+ in place.
#
# +html+ is parsed with +parser+ (anything responding to +parse+), the
# resulting tree is passed through #clean_node!, and the tree is serialized
# according to @config[:output] (:xhtml or :html) using
# @config[:output_encoding] and no indentation.
#
# Returns nil when the serialized output equals +html+; otherwise overwrites
# the contents of +html+ with the output and returns +html+ itself.
#
# Raises Error when @config[:output] is neither :xhtml nor :html.
def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
  fragment = parser.parse(html)
  clean_node!(fragment)

  save_options = {:encoding => @config[:output_encoding], :indent => 0}

  result =
    case @config[:output]
    when :xhtml
      save_options[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
      fragment.to_xhtml(save_options)
    when :html
      fragment.to_html(save_options)
    else
      raise Error, "unsupported output format: #{@config[:output]}"
    end

  return nil if result == html

  # Overwrite the caller's string in place, then hand back that same object.
  # (The value of the assignment expression is +result+, not +html+, so it
  # must not be used as the return value.)
  html[0, html.length] = result
  html
end
clean_document(html) click to toggle source

Returns a sanitized copy of the given full html document.

# File lib/sanitize.rb, line 146
# Sanitizes a duplicate of the full-document +html+, leaving the argument
# itself untouched. Returns nil when +html+ is nil.
#
# The returned string is always the duplicate, never the caller's object:
# when #clean_document! reports no changes (nil) the untouched duplicate is
# returned, matching what #clean does for fragments.
def clean_document(html)
  return if html.nil?

  copy = html.dup
  clean_document!(copy) || copy
end
clean_document!(html) click to toggle source

Performs ::clean_document in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 154
# Sanitizes the full-document +html+ in place by delegating to #clean! with
# the Nokogiri::HTML::Document parser, and returns what #clean! returns.
#
# Raises a RuntimeError unless 'html' is listed in @config[:elements] or
# @config[:remove_contents] is set.
def clean_document!(html)
  html_allowed = @config[:elements].include?('html')

  # Without this guard Nokogiri will raise for having multiple root nodes
  # when it moves its children to the root document context.
  unless html_allowed || @config[:remove_contents]
    raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
  end

  clean!(html, Nokogiri::HTML::Document)
end
clean_node!(node) click to toggle source

Sanitizes the specified Nokogiri::XML::Node and all its children.

# File lib/sanitize.rb, line 165
# Runs the configured transformers over +node+ and its descendants, then
# returns +node+.
#
# The breadth chain is walked first (skipped entirely when it is empty),
# followed by the depth chain. Both passes share a single whitelist Set.
#
# Raises ArgumentError unless +node+ is a Nokogiri::XML::Node.
def clean_node!(node)
  unless node.is_a?(Nokogiri::XML::Node)
    raise ArgumentError
  end

  shared_whitelist = Set.new
  breadth_needed = !@transformers[:breadth].empty?

  if breadth_needed
    traverse_breadth(node) do |current|
      transform_node!(current, shared_whitelist, :breadth)
    end
  end

  traverse_depth(node) do |current|
    transform_node!(current, shared_whitelist, :depth)
  end

  node
end

Private Instance Methods

transform_node!(node, node_whitelist, mode) click to toggle source
# File lib/sanitize.rb, line 180
# Calls every transformer registered under +mode+ in @transformers with a
# hash describing +node+, then returns +node+.
#
# The hash is rebuilt for each transformer, so the whitelist flag and the
# lowercased node name are read fresh each time. When a transformer returns
# a Hash whose :node_whitelist value responds to +each+, that value is
# merged into +node_whitelist+.
def transform_node!(node, node_whitelist, mode)
  @transformers[mode].each do |transformer|
    env = {
      :config         => @config,
      :is_whitelisted => node_whitelist.include?(node),
      :node           => node,
      :node_name      => node.name.downcase,
      :node_whitelist => node_whitelist,
      :traversal_mode => mode
    }

    outcome = transformer.call(env)
    next unless outcome.is_a?(Hash)

    additions = outcome[:node_whitelist]
    node_whitelist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
traverse_breadth(node, &block) click to toggle source

Performs the top-down ("breadth") traversal: operates first on the given node, then recurses into each of its children in turn, so every node is visited before its own descendants.

# File lib/sanitize.rb, line 201
# Top-down walk: hands +node+ to +block+ first, then recurses into each of
# the node's children in order. The children are read only after the block
# has run on +node+.
def traverse_breadth(node, &block)
  block.call(node)

  node.children.each do |descendant|
    traverse_breadth(descendant, &block)
  end
end
traverse_depth(node, &block) click to toggle source

Performs depth-first traversal, operating first on the deepest nodes in the document, then traversing upwards to the root.

# File lib/sanitize.rb, line 208
# Bottom-up walk: recurses into each of the node's children in order, and
# only then hands +node+ itself to +block+, so descendants are processed
# before their ancestors.
def traverse_depth(node, &block)
  node.children.each do |descendant|
    traverse_depth(descendant, &block)
  end

  block.call(node)
end