2011-01-26

Reverse-engineered RelaxNG schema for GAW XML Catalogue

GAW (Global Atmosphere Watch) has its own legacy catalogue system. I call it
legacy, but I don't mean that it is bad or inferior; rather, I'm interested in
the fact that an operating network already exists. Why doesn't the WIS
make use of it to implement its catalogue efficiently? So I
obtained sample XML data from WDCGG and analysed its structure.
# RELAX NG (compact syntax) grammar for the WDCGG station catalogue XML.
grammar {
# Document root: a single <wdcgg> element holding zero or more <station> records.
start =
element wdcgg {
element station {
# Station name, at most 64 characters.
element station_name { xsd:token { maxLength = "64" } },
# Station identifier: 9 characters laid out as 3 word characters, 3 digits,
# 1 word character and 2 digits (surrounding whitespace is tolerated by the pattern).
element id {
xsd:Name { length = "9" pattern = "\s*\w\w\w\d\d\d\w\d\d\s*" }
},
# Geographic position, constrained to -90..90 and -180..180.
element latitude {
xsd:float { maxInclusive = "90" minInclusive = "-90" }
},
element longitude {
xsd:float { maxInclusive = "180" minInclusive = "-180" }
},
# NOTE(review): the 0..5079 bounds presumably come from the sample data
# rather than from a specification - confirm before relying on them.
element altitude {
xsd:float { maxInclusive = "5079" minInclusive = "0" }
},
# Enumerated values; see the WMORegion and GawCategory patterns below.
element wmo_region { WMORegion },
element gaw_category { GawCategory },
element country { xsd:token },
element organization { xsd:token },
element description { xsd:string },
# Zero or more observed parameters per station.
element parameter {
# Parameter code: alphanumeric, at most 7 characters.
element parameter_name {
xsd:NMTOKEN { maxLength = "7" pattern = "\s*[0-9A-Za-z]+\s*" }
},
element status_of_report { xsd:string },
# Each of these dates may be left empty; see DateOrEmpty below.
element start_of_data { DateOrEmpty },
element end_of_data { DateOrEmpty },
element last_update { DateOrEmpty },
# At least one contributor is required for every parameter.
element contributor {
element organization { xsd:token },
element country { xsd:token }
}+,
# Contact persons are optional and may repeat.
element contact_person {
element name { xsd:token },
element organization { xsd:token },
element phone { xsd:token },
element fax { xsd:token },
element email { xsd:token }
}*
}*
}*
}
# Either empty content or a date written as YYYY-MM-DD.
DateOrEmpty =
empty
| xsd:date { pattern = "\s*[0-9]{4}-[0-9]{2}-[0-9]{2}\s*" }
# Allowed values of <wmo_region>.
WMORegion =
"REGION I (Africa)"
| "REGION II (Asia)"
| "REGION III (South America)"
| "REGION IV (North and Central America)"
| "REGION V (South-West Pacific)"
| "REGION VI (Europe)"
| "ANTARCTICA"
# Allowed values of <gaw_category>; the element may also be empty.
GawCategory =
empty
| "Regional"
| "Global"
| "Contributing"
| "Non-GAW (international)"
}

2010-10-30

Forward proxy of HTTPS by Apache HTTPD

I wanted to set up a forward proxy server for a web site that uses
both the HTTPS and HTTP protocols.
Fortunately, this is well documented at
<http://httpd.apache.org/docs/2.2/en/mod/mod_proxy.html>.

One thing I had to find out by trial and error is that an HTTPS connection is
implemented with the HTTP CONNECT method, so it cannot be matched by an
https:// URL in a <Proxy> directive.
Instead, we have to match a URI of the form "proxy:host:443".

# for HTTP: plain requests are matched by their full URL.
# With "Order deny,allow" a client that matches neither list is permitted,
# so "Deny from all" is required to restrict access to {CLIENT-IP} only.
<Proxy http://{ORIGIN-SERVER}/*>
Order deny,allow
Deny from all
Allow from {CLIENT-IP}
</Proxy>

# for HTTPS: the request arrives as CONNECT host:443, which is matched
# as "proxy:host:443" rather than as an https:// URL.
<Proxy proxy:{ORIGIN-SERVER}:443>
Order deny,allow
Deny from all
Allow from {CLIENT-IP}
</Proxy>

2010-10-21

ctags for XML Schema

require 'uri'
require 'rubygems'
require 'xml'

# Builds a vi "tags" file from the named top-level components of XML Schema
# documents, following xs:import / xs:include references between local files.
class App

  # Namespace the root element of every processed document must belong to.
  XSD_NS = 'http://www.w3.org/2001/XMLSchema'

  # argv - command-line arguments. Leading options are consumed here; the
  #        remaining entries are the schema files to process.
  def initialize(argv)
    @argv = argv.dup
    @cache = {}        # URI string => true for every document already read
    @names = {}        # component name => path of the file defining it
    @outfnam = 'tags'  # output file, overridden by -o
    getopts
  end

  # Writes str to stderr only when Ruby runs in verbose mode ($VERBOSE).
  def wputs(str)
    $stderr.puts str if $VERBOSE
  end

  # Writes str to stderr unconditionally.
  def eputs(str)
    $stderr.puts str
  end

  # Raises RuntimeError unless test == right.
  def assert_equal(test, right)
    raise "#{test} != #{right}" unless test == right
  end

  # Consumes leading "-" options from @argv.
  #   -oFILE / -o FILE  name of the tags file to write
  #   -h / --help       print usage and exit
  # Other options are dropped silently.
  # Raises ArgumentError when -o is given without a file name.
  def getopts
    while /^-/ === @argv.first
      case @argv.shift
      when '-h', '--help'
        help
        exit
      when /^-o(.*)/
        name = Regexp.last_match(1)
        # Accept the file name as a separate argument as well.
        name = @argv.shift.to_s if name.empty?
        raise ArgumentError, 'missing file name after -o' if name.empty?
        @outfnam = name
      end
    end
  end

  # Prints a short usage message to stdout.
  def help
    puts <<EOF
vi tags generator
usage: ruby #{$0} [-oTAGFILE] file.xsd ...
EOF
  end

  # Reads the schema at uri (a file URI), records the name of every
  # top-level named component in @names, then recurses into the documents
  # it imports or includes. Documents already seen are skipped.
  #
  # uri  - URI whose path is the local file to read
  # disp - text shown instead of uri in the verbose "skipping" message
  #
  # Raises RuntimeError when the root element is not in XSD_NS.
  def get1(uri, disp = nil)
    if @cache[uri.to_s]
      wputs "skipping #{disp || uri}"
      return
    end
    doc = XML::Document.file(uri.path)
    @cache[uri.to_s] = true
    assert_equal(doc.root.namespaces.namespace.href, XSD_NS)
    nsmap = { 'xs' => XSD_NS }

    children = []
    nodes = doc.find('/xs:schema/xs:import|/xs:schema/xs:include', nsmap)
    nodes.each do |node|
      location = node['schemaLocation'].to_s
      # An import without schemaLocation names no file to follow.
      children.push(location) unless location.empty?
    end
    nodes = nil

    nodes = doc.find('/xs:schema/xs:*/@name', nsmap)
    nodes.each do |node|
      name = node.value
      if @names[name]
        eputs "duplicated #{name} in #{uri.path} and #{@names[name]}"
      end
      @names[name] = uri.path
    end
    # Drop the references to the parsed document before descending.
    nodes = nil
    doc = nil

    children.each { |child| get1(uri + child, child) }
  end

  # Processes one schema file given on the command line.
  # A relative name is resolved against the current directory; simply
  # prefixing "file:///" would make it relative to the filesystem root.
  def run1(filename)
    path = File.expand_path(filename)
    path = "/#{path}" unless path.start_with?('/')
    get1(URI("file://#{path}"))
  end

  # Processes every file named on the command line. Returns self.
  def run
    @argv.each { |filename| run1(filename) }
    self
  end

  # Writes the collected names to @outfnam, sorted, one tag per line as
  # name TAB file TAB search-pattern. The pattern looks for the name
  # enclosed in single or double quotes; non-word characters in the name
  # become "." wildcards.
  def output
    File.open(@outfnam, 'w') do |fp|
      now = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
      fp.puts "!_TAG_FILE_SORTED\t1\tsort=case-sensitive date=#{now}"
      @names.keys.sort.each do |name|
        query = '/\["\']' + name.gsub(/\W/, '.') + '\["\']/'
        fp.puts [name, @names[name], query].join("\t")
      end
    end
    eputs "saved to #{@outfnam}"
  end

  # Writes the tags file. Returns self.
  def close
    output
    self
  end

end

# Entry point: collect names from the schemas on the command line, then write the tags file.
app = App.new(ARGV)
app.run
app.close
The resulting tags file is really powerful when editing XML in vi.
====
require 'uri'
require 'rubygems'
require 'xml'

# Builds a vi "tags" file from the named top-level components of XML Schema
# documents, following xs:import / xs:include references between local files.
class App

  # Namespace the root element of every processed document must belong to.
  XSD_NS = 'http://www.w3.org/2001/XMLSchema'

  # argv - command-line arguments. Leading options are consumed here; the
  #        remaining entries are the schema files to process.
  def initialize(argv)
    @argv = argv.dup
    @cache = {}        # URI string => true for every document already read
    @names = {}        # component name => path of the file defining it
    @outfnam = 'tags'  # output file, overridden by -o
    getopts
  end

  # Writes str to stderr only when Ruby runs in verbose mode ($VERBOSE).
  def wputs(str)
    $stderr.puts str if $VERBOSE
  end

  # Writes str to stderr unconditionally.
  def eputs(str)
    $stderr.puts str
  end

  # Raises RuntimeError unless test == right.
  def assert_equal(test, right)
    raise "#{test} != #{right}" unless test == right
  end

  # Consumes leading "-" options from @argv.
  #   -oFILE / -o FILE  name of the tags file to write
  #   -h / --help       print usage and exit
  # Other options are dropped silently.
  # Raises ArgumentError when -o is given without a file name.
  def getopts
    while /^-/ === @argv.first
      case @argv.shift
      when '-h', '--help'
        help
        exit
      when /^-o(.*)/
        name = Regexp.last_match(1)
        # Accept the file name as a separate argument as well.
        name = @argv.shift.to_s if name.empty?
        raise ArgumentError, 'missing file name after -o' if name.empty?
        @outfnam = name
      end
    end
  end

  # Prints a short usage message to stdout.
  def help
    puts <<EOF
vi tags generator
usage: ruby #{$0} [-oTAGFILE] file.xsd ...
EOF
  end

  # Reads the schema at uri (a file URI), records the name of every
  # top-level named component in @names, then recurses into the documents
  # it imports or includes. Documents already seen are skipped.
  #
  # uri  - URI whose path is the local file to read
  # disp - text shown instead of uri in the verbose "skipping" message
  #
  # Raises RuntimeError when the root element is not in XSD_NS.
  def get1(uri, disp = nil)
    if @cache[uri.to_s]
      wputs "skipping #{disp || uri}"
      return
    end
    doc = XML::Document.file(uri.path)
    @cache[uri.to_s] = true
    assert_equal(doc.root.namespaces.namespace.href, XSD_NS)
    nsmap = { 'xs' => XSD_NS }

    children = []
    nodes = doc.find('/xs:schema/xs:import|/xs:schema/xs:include', nsmap)
    nodes.each do |node|
      location = node['schemaLocation'].to_s
      # An import without schemaLocation names no file to follow.
      children.push(location) unless location.empty?
    end
    nodes = nil

    nodes = doc.find('/xs:schema/xs:*/@name', nsmap)
    nodes.each do |node|
      name = node.value
      if @names[name]
        eputs "duplicated #{name} in #{uri.path} and #{@names[name]}"
      end
      @names[name] = uri.path
    end
    # Drop the references to the parsed document before descending.
    nodes = nil
    doc = nil

    children.each { |child| get1(uri + child, child) }
  end

  # Processes one schema file given on the command line.
  # A relative name is resolved against the current directory; simply
  # prefixing "file:///" would make it relative to the filesystem root.
  def run1(filename)
    path = File.expand_path(filename)
    path = "/#{path}" unless path.start_with?('/')
    get1(URI("file://#{path}"))
  end

  # Processes every file named on the command line. Returns self.
  def run
    @argv.each { |filename| run1(filename) }
    self
  end

  # Writes the collected names to @outfnam, sorted, one tag per line as
  # name TAB file TAB search-pattern. The pattern looks for the name
  # enclosed in single or double quotes; non-word characters in the name
  # become "." wildcards.
  def output
    File.open(@outfnam, 'w') do |fp|
      now = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
      fp.puts "!_TAG_FILE_SORTED\t1\tsort=case-sensitive date=#{now}"
      @names.keys.sort.each do |name|
        query = '/\["\']' + name.gsub(/\W/, '.') + '\["\']/'
        fp.puts [name, @names[name], query].join("\t")
      end
    end
    eputs "saved to #{@outfnam}"
  end

  # Writes the tags file. Returns self.
  def close
    output
    self
  end

end

# Entry point: collect names from the schemas on the command line, then write the tags file.
app = App.new(ARGV)
app.run
app.close

Caveat - xlink:href in ISO 19139 Geographic Metadata

I wanted to reuse the same gmd:authority/gmd:CI_Citation instance
many times, so I came up with the idea of using xlink:href.

[definition]
<gmd:authority>
<gmd:CI_Citation id="url.authority">
<gmd:title>...
</gmd:CI_Citation>
</gmd:authority>

[quotation]
<gmd:authority>
<gmd:CI_Citation xlink:href="#url.authority"/>
</gmd:authority>

But this causes a validation error saying that xlink:href is not allowed in CI_Citation.
It took a while, but I finally found the reason by reading the XML schema.

The xlink:href attribute must be attached to the parent element, that is, the
element that would contain the omitted element if xlink:href were not used.
So the following validates.

<gmd:authority>
<gmd:CI_Citation id="url.authority">
<gmd:title>...
</gmd:CI_Citation>
</gmd:authority>

<gmd:authority xlink:href="#url.authority"/>

2010-10-20

A libxml-ruby script to download XSD recursively

require 'uri'
require 'net/http'
require 'rubygems'
require 'xml'

# Downloads XML Schema documents over HTTP(S) and recursively fetches every
# document they xs:import or xs:include, saving each one under the current
# directory in a path built from the URI's host and path.
class App

  # Namespace the root element of every downloaded document must belong to.
  XSD_NS = 'http://www.w3.org/2001/XMLSchema'

  # argv - command-line arguments. Leading options are consumed here; the
  #        remaining entries are the URIs to download.
  def initialize(argv)
    @argv = argv.dup
    @cache = {}    # URI string => true for every document already fetched
    @htconn = {}   # :shp => [scheme, host, port], :conn => open Net::HTTP
    @strip = 0     # leading path components to drop (see -pNUM)
    getopts
  end

  # Consumes leading "-" options from @argv.
  #   -pNUM        strip NUM leading components from saved file names
  #   -h / --help  print usage and exit
  # Raises ArgumentError for any other option, which would otherwise be
  # treated as a URI to download.
  def getopts
    while /^-/ === @argv.first
      case opt = @argv.shift
      when '-h', '--help'
        help
        exit
      when /\A-p(\d+)\z/
        @strip = Regexp.last_match(1).to_i
      else
        raise ArgumentError, "unknown option: #{opt}"
      end
    end
  end

  # Prints a short usage message to stdout.
  def help
    puts <<EOF
XSD downloader following includes and imports
usage: ruby #$0 [-pNUM] uri ...
-pNUM number of directory structure (including hostname) to be stripped
EOF
  end

  # Returns the relative file name uri is saved under: host joined with
  # path, minus the first @strip components. The last component is always
  # kept so that stripping cannot produce an empty name.
  def outfnam(uri)
    full = File.join(*[uri.host, uri.path].compact)
    return full if @strip.to_i <= 0
    parts = full.split('/').reject(&:empty?)
    kept = parts.drop(@strip)
    kept = parts.last(1) if kept.empty?
    File.join(*kept)
  end

  # Finishes the cached connection if one is open.
  def close
    conn = @htconn[:conn]
    # finish raises IOError on a connection that was never started.
    conn.finish if conn && conn.started?
    @htconn.clear
    nil
  end

  # Yields a started Net::HTTP connection for uri, reusing the cached one
  # when scheme, host and port are unchanged.
  def getconn(uri)
    shp = [uri.scheme, uri.host, uri.port]
    conn = @htconn[:conn]
    if @htconn[:shp] == shp && conn && conn.started?
      puts 'reusing connection'
    else
      close
      puts "connecting #{shp.join(' ')}"
      conn = Net::HTTP.new(uri.host, uri.port)
      # Without this an https URI would be spoken to in plain HTTP.
      conn.use_ssl = (uri.scheme == 'https')
      conn.start
      @htconn[:shp] = shp
      @htconn[:conn] = conn
    end
    yield conn
  end

  # Raises RuntimeError unless test == right.
  def assert_equal(test, right)
    raise "#{test} != #{right}" unless test == right
  end

  # Creates dirname and any missing parents.
  # Returns nil when the directory already exists.
  # Raises Errno::ENOTDIR when dirname exists but is not a directory.
  def mkdir_p(dirname)
    return nil if File.directory?(dirname)
    raise Errno::ENOTDIR, "not a directory: (#{dirname})" if
      File.exist?(dirname)
    puts "mkdir #{dirname}"
    mkdir_p(File.dirname(dirname))
    Dir.mkdir(dirname)
  end

  # Writes content to filename in binary mode, creating parent directories.
  def savefile(filename, content)
    puts "saving #{filename}"
    mkdir_p File.dirname(filename)
    File.open(filename, 'wb') { |fp| fp.write(content) }
  end

  # Downloads uri, saves it, then recurses into the documents it imports
  # or includes. URIs already fetched are skipped.
  #
  # uri  - URI::HTTP or URI::HTTPS to fetch
  # disp - text shown instead of uri in the "skipping" message
  #
  # Raises ArgumentError for a non-HTTP(S) URI, and RuntimeError on a
  # non-200 response or when the root element is not in XSD_NS.
  def get1(uri, disp = nil)
    if @cache[uri.to_s]
      puts "skipping #{disp || uri}"
      return
    end
    raise ArgumentError, "not an HTTP(S) URI: #{uri}" unless uri.is_a?(URI::HTTP)
    ofn = outfnam(uri)
    buf = nil
    getconn(uri) do |conn|
      # request_uri keeps the query string and maps an empty path to "/".
      resp = conn.get(uri.request_uri)
      raise "#{resp.code} #{resp.message}" unless /^200/ === resp.code
      buf = resp.body
    end
    savefile(ofn, buf)
    @cache[uri.to_s] = true
    doc = XML::Document.string(buf)
    assert_equal(doc.root.namespaces.namespace.href, XSD_NS)
    nodes = doc.find('/xs:schema/xs:import|/xs:schema/xs:include', 'xs' => XSD_NS)
    nodes.each do |node|
      child = node['schemaLocation'].to_s
      # An import without schemaLocation names nothing to download.
      next if child.empty?
      get1(uri + child, child)
    end
    nodes = nil
  end

  # Downloads the schema named by one command-line argument.
  def run1(arg)
    get1(URI(arg))
  end

  # Downloads every URI named on the command line. Returns self.
  def run
    @argv.each { |arg| run1(arg) }
    self
  end

end

# Entry point: download every URI on the command line, then close the connection.
app = App.new(ARGV)
app.run
app.close

2010-10-13

How to create processing instruction when writing XML using libxml-ruby

Actually, I couldn't find a way to do it. So I had to use libxslt-ruby and
apply a stylesheet that simply inserts the PI.

require 'libxslt'
# Prepend an <?xml-stylesheet?> processing instruction to @xdoc.
# Done only when @xslt, which becomes the PI's href, is set.
if @xslt
# Stylesheet that emits the PI and then copies the whole input document.
# The heredoc body must start at column 0 so that the XML declaration is
# the very first thing in the parsed string.
# NOTE(review): @xslt is interpolated into the XML text unescaped -
# presumably it never contains '"', '&' or '<'; confirm.
filter = <<-END_OF_XSLT
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
<xsl:processing-instruction name="xml-stylesheet"
>type="text/xsl" href="#{@xslt}"</xsl:processing-instruction>
<xsl:copy-of select="."/>
</xsl:template>
</xsl:stylesheet>
END_OF_XSLT
# Parse the stylesheet text and apply it, replacing @xdoc with the result.
stylesheet = LibXSLT::XSLT::Stylesheet.new(XML::Document.string(filter))
@xdoc = stylesheet.apply(@xdoc)
end