class Sanitize

Constants

REGEX_PROTOCOL

Matches an attribute value that could be treated by a browser as a URL with a protocol prefix, such as “http:” or “javascript:”. Any string of zero or more characters followed by a colon is considered a match, even if the colon is encoded as an entity and even if it's an incomplete entity (which IE6 and Opera will still parse).

VERSION

Attributes

config[R]

Public Class Methods

clean(html, config = {}) click to toggle source

Returns a sanitized copy of html, using the settings in config if specified.

# File lib/sanitize.rb, line 51
# Class-level shortcut: builds a throwaway Sanitize instance from +config+
# and returns whatever its #clean returns for +html+.
def self.clean(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean(html)
end
clean!(html, config = {}) click to toggle source

Performs #clean in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 57
# Class-level shortcut: builds a throwaway Sanitize instance from +config+
# and returns whatever its #clean! returns for +html+.
def self.clean!(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean!(html)
end
clean_document(html, config = {}) click to toggle source

Performs a #clean using a full-document HTML parser instead of the default fragment parser. This will add a DOCTYPE and html tag unless they are already present.

# File lib/sanitize.rb, line 64
# Class-level shortcut: builds a throwaway Sanitize instance from +config+
# and returns whatever its #clean_document returns for +html+.
def self.clean_document(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_document(html)
end
clean_document!(html, config = {}) click to toggle source

Performs #clean_document in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 70
# Class-level shortcut: builds a throwaway Sanitize instance from +config+
# and returns whatever its #clean_document! returns for +html+.
def self.clean_document!(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_document!(html)
end
clean_node!(node, config = {}) click to toggle source

Sanitizes the specified Nokogiri::XML::Node and all its children.

# File lib/sanitize.rb, line 75
# Class-level shortcut: builds a throwaway Sanitize instance from +config+
# and returns whatever its #clean_node! returns for +node+.
def self.clean_node!(node, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.clean_node!(node)
end
new(config = {}) click to toggle source

Returns a new Sanitize object initialized with the settings in config.

# File lib/sanitize.rb, line 84
# Sets up a sanitizer: merges +config+ over Config::DEFAULT and assembles the
# two transformer chains (:breadth and :depth) that #transform_node! runs.
def initialize(config = {})
  @config = Config::DEFAULT.merge(config)

  # The breadth chain is built from a copy of the configured value.
  breadth_chain = Array(@config[:transformers_breadth].dup)

  # The depth chain is the generic :transformers followed by the
  # depth-specific ones; `+` yields a fresh array, so appending below does not
  # touch the arrays stored in the config.
  depth_chain = Array(@config[:transformers]) + Array(@config[:transformers_depth])

  # Built-in depth transformers always go last, after any custom ones.
  depth_chain << Transformers::CleanComment unless @config[:allow_comments]
  depth_chain << Transformers::CleanCDATA
  depth_chain << Transformers::CleanElement.new(@config)

  @transformers = {:breadth => breadth_chain, :depth => depth_chain}
end

Public Instance Methods

clean(html) click to toggle source

Returns a sanitized copy of html.

# File lib/sanitize.rb, line 102
# Non-destructive counterpart of #clean!. Returns nil when +html+ is nil or
# false; otherwise works on a duplicate so the argument is left untouched.
# When #clean! reports no change (nil), the untouched duplicate is returned.
def clean(html)
  return unless html

  copy = html.dup
  clean!(copy) || copy
end
clean!(html, parser = Nokogiri::HTML::DocumentFragment) click to toggle source

Performs clean in place, returning html, or nil if no changes were made.

# File lib/sanitize.rb, line 111
# Sanitizes +html+ in place.
#
# Parses +html+ with +parser+ (anything responding to +parse+), runs
# #clean_node! over the parsed tree, serializes it according to
# @config[:output] (:xhtml or :html) and overwrites the contents of +html+
# with the serialized text.
#
# Returns the serialized string that was written into +html+, or nil when the
# serialized text equals the input (in which case +html+ is not modified).
# Raises Error when @config[:output] is neither :xhtml nor :html.
def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
  fragment = parser.parse(html)
  clean_node!(fragment)

  format  = @config[:output]
  options = {:encoding => @config[:output_encoding], :indent => 0}

  if format == :xhtml
    serializer = :to_xhtml
    options[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
  elsif format == :html
    serializer = :to_html
  else
    raise Error, "unsupported output format: #{format}"
  end

  cleaned = fragment.send(serializer, options)
  return nil if cleaned == html

  # Replace the whole string in place; the assignment evaluates to +cleaned+.
  html[0, html.length] = cleaned
end
clean_document(html) click to toggle source
# File lib/sanitize.rb, line 131
# Non-destructive counterpart of #clean_document!. Returns nil when +html+ is
# nil; otherwise sanitizes a duplicate of +html+ and returns the sanitized
# text.
#
# When #clean_document! reports no change (nil), the duplicate is returned
# rather than the caller's own string. This matches #clean, so the result can
# be mutated without affecting the argument.
def clean_document(html)
  return if html.nil?

  copy = html.dup
  clean_document!(copy) || copy
end
clean_document!(html) click to toggle source
# File lib/sanitize.rb, line 137
# In-place variant of #clean_document: delegates to #clean! with the
# full-document parser (Nokogiri::HTML::Document) and returns its result.
#
# Raises a RuntimeError unless 'html' is among @config[:elements] or
# @config[:remove_contents] is truthy.
def clean_document!(html)
  # Without one of these, Nokogiri will raise for having multiple root nodes
  # when it moves the html element's children to the root document context.
  permitted = @config[:elements].include?('html') || @config[:remove_contents]
  raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true' unless permitted

  clean!(html, Nokogiri::HTML::Document)
end
clean_node!(node) click to toggle source

Sanitizes the specified Nokogiri::XML::Node and all its children.

# File lib/sanitize.rb, line 148
# Runs the configured transformers over +node+ and its descendants, then
# returns +node+.
#
# The :breadth pass (via #traverse_breadth) is skipped when no breadth
# transformers are configured; the :depth pass (via #traverse_depth) always
# runs. Both passes share one whitelist Set.
#
# Raises ArgumentError unless +node+ is a Nokogiri::XML::Node.
def clean_node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  shared_whitelist = Set.new
  breadth_configured = !@transformers[:breadth].empty?

  if breadth_configured
    traverse_breadth(node) do |current|
      transform_node!(current, shared_whitelist, :breadth)
    end
  end

  traverse_depth(node) do |current|
    transform_node!(current, shared_whitelist, :depth)
  end

  node
end

Private Instance Methods

transform_node!(node, node_whitelist, mode) click to toggle source
# File lib/sanitize.rb, line 163
# Invokes every transformer registered for +mode+ on +node+, in order, and
# returns +node+.
#
# Each transformer is called with a single hash describing the node. The hash
# is rebuilt for every transformer, so :is_whitelisted and :node_name reflect
# anything an earlier transformer changed. When a transformer returns a Hash
# whose :node_whitelist responds to +each+, its entries are merged into
# +node_whitelist+.
def transform_node!(node, node_whitelist, mode)
  @transformers[mode].each do |transformer|
    env = {
      :config         => @config,
      :is_whitelisted => node_whitelist.include?(node),
      :node           => node,
      :node_name      => node.name.downcase,
      :node_whitelist => node_whitelist,
      :traversal_mode => mode
    }

    outcome = transformer.call(env)
    next unless outcome.is_a?(Hash)

    additions = outcome[:node_whitelist]
    node_whitelist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
traverse_breadth(node, &block) click to toggle source

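Performs the traversal used for breadth transformers, operating first on the root node, then traversing downwards into each child in turn (a top-down, pre-order walk rather than a strict level-by-level breadth-first search).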

# File lib/sanitize.rb, line 184
# Top-down walk: calls +block+ with +node+ first, then recurses into each of
# node.children in order (each child's subtree is finished before its next
# sibling is visited).
def traverse_breadth(node, &block)
  block.call(node)

  node.children.each do |child_node|
    traverse_breadth(child_node, &block)
  end
end
traverse_depth(node, &block) click to toggle source

Performs depth-first traversal, operating first on the deepest nodes in the document, then traversing upwards to the root.

# File lib/sanitize.rb, line 191
# Bottom-up walk: recurses into each of node.children in order first, then
# calls +block+ with +node+ itself, so every node is visited after all of its
# descendants. Returns the block's value for +node+.
def traverse_depth(node, &block)
  node.children.each do |child_node|
    traverse_depth(child_node, &block)
  end

  block.call(node)
end