Skip to content

Commit

Permalink
fix issue #44
Browse files Browse the repository at this point in the history
  • Loading branch information
andrew2net committed Feb 18, 2021
1 parent 8b4541f commit f25eed2
Show file tree
Hide file tree
Showing 21 changed files with 409 additions and 568 deletions.
2 changes: 1 addition & 1 deletion lib/relaton_ietf/hash_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class << self
# @param item [Hash]
# @return [RelatonIetf::IetfBibliographicItem]
def bib_item(item)
IecBibliographicItem.new(item)
IetfBibliographicItem.new(item)
end
end
end
Expand Down
87 changes: 41 additions & 46 deletions lib/relaton_ietf/scrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,18 @@ module RelatonIetf
# Scrapper module
module Scrapper
GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference."
RFC_URI_PATTERN = "https://xml2rfc.tools.ietf.org/public/rfc/bibxml"
# ID_URI_PATTERN = "https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids/reference.CODE"
BCP_URI_PATTERN = "https://www.rfc-editor.org/info/CODE"

class << self
# rubocop:disable Metrics/MethodLength

# @param text [String]
# @param is_relation [TrueClass, FalseClass]
# @return [RelatonIetf::IetfBibliographicItem]
def scrape_page(text, is_relation = false)
# Remove initial "IETF " string if specified
ref = text.gsub(/^IETF /, "")
if ref.match? /^BCP/ then bcp_item BCP_URI_PATTERN.dup, ref
else rfc_item ref, is_relation
end
/^(RFC|BCP|FYI|STD)\s(?<num>\d+)/ =~ ref
ref.sub! /(?<=^(?:RFC|BCP|FYI|STD)\s)(\d+)/, num.rjust(4, "0") if num
rfc_item ref, is_relation
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
Net::ProtocolError, SocketError
Expand All @@ -38,7 +34,7 @@ def scrape_page(text, is_relation = false)
# @param url [String, NilClass]
# @param ver [String, NilClass] Internet Draft version
# @return [RelatonIetf::IetfBibliographicItem]
def fetch_rfc(reference, is_relation = false, url = nil, ver = nil) # rubocop:disable Metrics/AbcSize
def fetch_rfc(reference, is_relation = false, url = nil, ver = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
return unless reference

ietf_item(
Expand All @@ -50,16 +46,17 @@ def fetch_rfc(reference, is_relation = false, url = nil, ver = nil) # rubocop:di
language: [language(reference)],
link: link(reference, url, ver),
title: titles(reference),
formattedref: formattedref(reference),
abstract: abstracts(reference),
contributor: contributors(reference),
relation: relations(reference),
date: dates(reference),
series: series(reference),
place: ["Fremont, CA"],
keyword: reference.xpath("front/keyword").map(&:text),
doctype: doctype(reference[:anchor])
)
end
# rubocop:enable Metrics/MethodLength

private

Expand Down Expand Up @@ -105,35 +102,19 @@ def rfc_item(ref, is_relation)

uri = "#{GH_URL}#{ref.sub(/\s|\u00a0/, '.')}.xml"
doc = Nokogiri::XML get_page(uri)
fetch_rfc doc.at("//reference"), is_relation, uri, ver
end

# @param uri_template [String]
# @param reference [String]
# @return [RelatonIetf::IetfBibliographicItem]
def bcp_item(uri_template, reference) # rubocop:disable Metrics/MethodLength
uri = uri_template.sub "CODE", reference.sub(" ", "").downcase
doc = Nokogiri::HTML get_page(uri)
ietf_item(
id: reference,
title: [content: ""],
docid: [RelatonBib::DocumentIdentifier.new(type: "IETF", id: reference)],
language: ["en"],
link: [{ type: "src", content: uri }],
relation: fetch_relations(doc),
doctype: "rfc"
)
fetch_rfc doc.at("/referencegroup", "/reference"), is_relation, uri, ver
end

def fetch_relations(doc)
doc.xpath("//table/tr/td/a[contains(., 'RFC')]").map do |r|
RelatonBib::DocumentRelation.new(
type: "merges",
bibitem: scrape_page(r.text, true)
)
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def relations(reference)
reference.xpath("reference").map do |ref|
{ type: "includes", bibitem: fetch_rfc(ref, true) }
end
end

# @param uri [String]
# @return [String] HTTP response body
def get_page(uri)
res = Net::HTTP.get_response(URI(uri))
if res.code != "200"
Expand All @@ -143,17 +124,30 @@ def get_page(uri)
res.body
end

# Language of the reference, taken from its `lang` attribute.
#
# @param reference [Nokogiri::XML::Element]
# @return [String] the `lang` attribute value, or "en" when it is absent
def language(reference)
  lang = reference[:lang]
  return lang if lang

  "en"
end

# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def titles(reference)
title = reference.at("./front/title")
[{ content: title.text, language: language(reference), script: "Latn" }]
reference.xpath("./front/title").map do |title|
{ content: title.text, language: language(reference), script: "Latn" }
end
end

# Build a fallback formatted reference for entries that carry no title.
#
# @param reference [Nokogiri::XML::Element]
# @return [RelatonBib::FormattedRef, nil] nil when the reference has a
#   `front/title` element, or when none of the `anchor`, `docName` and
#   `number` attributes is set
def formattedref(reference)
  # A titled reference is described by its title; skip the formattedref.
  return if reference.at "./front/title"

  cont = reference[:anchor] || reference[:docName] || reference[:number]
  return unless cont

  RelatonBib::FormattedRef.new content: cont, language: language(reference), script: "Latn"
end

# @param ref [Nokogiri::XML::Element]
# @return [Array<RelatonBib::FormattedString>]
def abstracts(ref)
ref.xpath("./front/abstract").map do |a|
Expand All @@ -164,11 +158,13 @@ def abstracts(ref)
end
end

# Collect every contributor of a reference: the entries returned by
# `persons` followed by the entries returned by `organizations`.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def contributors(reference)
  people = persons(reference)
  orgs = organizations(reference)
  people + orgs
end

# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Person,Symbol=>Array<String>}>]
def persons(reference)
reference.xpath("./front/author[@surname]|./front/author[@fullname]")
Expand All @@ -182,6 +178,7 @@ def persons(reference)
end
end

# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Organization,Symbol=>Array<String>}>]
def organizations(reference)
publisher = { entity: new_org, role: [type: "publisher"] }
Expand All @@ -197,8 +194,8 @@ def organizations(reference)
end
end

# @param author [Nokogiri::XML::Document]
# @param ref [Nokogiri::XML::Document]
# @param author [Nokogiri::XML::Element]
# @param ref [Nokogiri::XML::Element]
# @return [RelatonBib::FullName]
def full_name(author, ref)
lang = language ref
Expand All @@ -218,7 +215,7 @@ def localized_string(content, lang)
RelatonBib::LocalizedString.new(content, lang)
end

# @param postal [Nokogiri::XML::Document]
# @param addr [Nokogiri::XML::Element]
# @return [Array<RelatonBib::Address, RelatonBib::Phone>]
def contacts(addr)
contacts = []
Expand All @@ -232,7 +229,7 @@ def contacts(addr)
contacts
end

# @param postal [Nokogiri::XML::Document]
# @param postal [Nokogiri::XML::Element]
# @return [RelatonBib::Address]
def address(postal) # rubocop:disable Metrics/CyclomaticComplexity
RelatonBib::Address.new(
Expand All @@ -252,7 +249,7 @@ def add_contact(contacts, type, value)
contacts << RelatonBib::Contact.new(type: type, value: value.text)
end

# @param author [Nokogiri::XML::Document]
# @param author [Nokogiri::XML::Element]
# @return [RelatonBib::Affiliation]
def affiliation(author)
organization = author.at("./organization")
Expand Down Expand Up @@ -287,6 +284,7 @@ def month(mon)
#
# Extract date from reference.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<RelatonBib::BibliographicDate>] published date.
#
def dates(reference)
Expand All @@ -298,8 +296,6 @@ def dates(reference)
[RelatonBib::BibliographicDate.new(type: "published", on: date)]
end

# rubocop:disable Metrics/MethodLength, Metrics/AbcSize

#
# Extract document identifiers from reference
#
Expand All @@ -308,7 +304,7 @@ def dates(reference)
#
# @return [Array<RelatonBib::DocumentIdentifier>]
#
def docids(reference, ver) # rubocop:disable Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
def docids(reference, ver) # rubocop:disable Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity,Metrics/AbcSize
id = (reference[:anchor] || reference[:docName] || reference[:number])
ret = []
if id
Expand All @@ -327,11 +323,10 @@ def docids(reference, ver) # rubocop:disable Metrics/MethodLength,Metrics/Cyclom
RelatonBib::DocumentIdentifier.new(id: id, type: si[:name])
end.compact
end
# enable Metrics/MethodLength, Metrics/AbcSize

#
# Extract series from reference
# @param reference [Nokogiri::XML::Document]
# @param reference [Nokogiri::XML::Element]
#
# @return [Array<RelatonBib::Series>]
#
Expand All @@ -351,7 +346,7 @@ def series(reference)

#
# extract status
# @param reference [Nokogiri::XML::Document]
# @param reference [Nokogiri::XML::Element]
#
# @return [RelatonBib::DocumentStatus]
#
Expand Down
2 changes: 1 addition & 1 deletion lib/relaton_ietf/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module RelatonIetf
VERSION = "1.7.1".freeze
VERSION = "1.7.2".freeze
end
122 changes: 16 additions & 106 deletions spec/examples/bcp_47.xml
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
<bibdata>
<fetched>2021-02-06</fetched>
<title format="text/plain"></title>
<bibdata type="standard">
<fetched>2021-02-18</fetched>
<formattedref format="text/plain" language="en" script="Latn">BCP47</formattedref>
<uri type="xml">https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference.BCP.0047.xml</uri>
<uri type="src">https://www.rfc-editor.org/info/bcp47</uri>
<docidentifier type="IETF">BCP 47</docidentifier>
<docidentifier type="IETF">BCP47</docidentifier>
<docidentifier type="rfc-anchor">BCP47</docidentifier>
<contributor>
<role type="publisher"/>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</contributor>
<language>en</language>
<script>Latn</script>
<relation type="merges">
<relation type="includes">
<bibitem type="standard">
<title format="text/plain" language="en" script="Latn">Matching of Language Tags</title>
<uri type="xml">https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference.RFC.4647.xml</uri>
<uri type="src">https://www.rfc-editor.org/info/rfc4647</uri>
<docidentifier type="IETF">RFC 4647</docidentifier>
<docidentifier type="rfc-anchor">RFC4647</docidentifier>
Expand Down Expand Up @@ -65,52 +73,9 @@
<place>Fremont, CA</place>
</bibitem>
</relation>
<relation type="merges">
<bibitem type="standard">
<title format="text/plain" language="en" script="Latn">Tags for the Identification of Languages</title>
<uri type="xml">https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference.RFC.3066.xml</uri>
<uri type="src">https://www.rfc-editor.org/info/rfc3066</uri>
<docidentifier type="IETF">RFC 3066</docidentifier>
<docidentifier type="rfc-anchor">RFC3066</docidentifier>
<docidentifier type="DOI">10.17487/RFC3066</docidentifier>
<date type="published">
<on>2001-01</on>
</date>
<contributor>
<role type="author"/>
<person>
<name>
<completename language="en">H. Alvestrand</completename>
</name>
<affiliation>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</affiliation>
</person>
</contributor>
<contributor>
<role type="publisher"/>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</contributor>
<language>en</language>
<script>Latn</script>
<abstract format="text/plain" language="en" script="Latn">This document describes a language tag for use in cases where it is desired to indicate the language used in an information object, how to register values for use in this language tag, and a construct for matching such language tags. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.</abstract>
<series type="main">
<title format="text/plain" language="en" script="Latn">RFC</title>
<number>3066</number>
</series>
<place>Fremont, CA</place>
</bibitem>
</relation>
<relation type="merges">
<relation type="includes">
<bibitem type="standard">
<title format="text/plain" language="en" script="Latn">Tags for Identifying Languages</title>
<uri type="xml">https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference.RFC.5646.xml</uri>
<uri type="src">https://www.rfc-editor.org/info/rfc5646</uri>
<docidentifier type="IETF">RFC 5646</docidentifier>
<docidentifier type="rfc-anchor">RFC5646</docidentifier>
Expand Down Expand Up @@ -167,62 +132,7 @@
<place>Fremont, CA</place>
</bibitem>
</relation>
<relation type="merges">
<bibitem type="standard">
<title format="text/plain" language="en" script="Latn">Tags for Identifying Languages</title>
<uri type="xml">https://raw.githubusercontent.com/relaton/relaton-data-ietf/master/data/reference.RFC.4646.xml</uri>
<uri type="src">https://www.rfc-editor.org/info/rfc4646</uri>
<docidentifier type="IETF">RFC 4646</docidentifier>
<docidentifier type="rfc-anchor">RFC4646</docidentifier>
<docidentifier type="DOI">10.17487/RFC4646</docidentifier>
<date type="published">
<on>2006-09</on>
</date>
<contributor>
<role type="author"/>
<person>
<name>
<completename language="en">A. Phillips</completename>
</name>
<affiliation>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</affiliation>
</person>
</contributor>
<contributor>
<role type="author"/>
<person>
<name>
<completename language="en">M. Davis</completename>
</name>
<affiliation>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</affiliation>
</person>
</contributor>
<contributor>
<role type="publisher"/>
<organization>
<name>Internet Engineering Task Force</name>
<abbreviation>IETF</abbreviation>
</organization>
</contributor>
<language>en</language>
<script>Latn</script>
<abstract format="text/plain" language="en" script="Latn">This document describes the structure, content, construction, and semantics of language tags for use in cases where it is desirable to indicate the language used in an information object. It also describes how to register values for use in language tags and the creation of user-defined extensions for private interchange. This document, in combination with RFC 4647, replaces RFC 3066, which replaced RFC 1766. This document specifies an Internet Best Current Practices for the Internet Community, and requests discussion and suggestions for improvements.</abstract>
<series type="main">
<title format="text/plain" language="en" script="Latn">RFC</title>
<number>4646</number>
</series>
<place>Fremont, CA</place>
</bibitem>
</relation>
<place>Fremont, CA</place>
<ext>
<doctype>rfc</doctype>
</ext>
Expand Down
Loading

0 comments on commit f25eed2

Please sign in to comment.