From 72a86c19764d0f45741ae004d7618db62366969b Mon Sep 17 00:00:00 2001 From: Ronald Tse Date: Wed, 15 Nov 2023 16:28:38 +0800 Subject: [PATCH] feat: initial commit --- .github/workflows/main.yml | 27 +++++++++ .gitignore | 11 ++++ .rspec | 3 + CODE_OF_CONDUCT.md | 84 ++++++++++++++++++++++++++ Gemfile | 5 ++ Rakefile | 12 ++++ bin/console | 11 ++++ bin/setup | 8 +++ exe/termium | 64 ++++++++++++++++++++ lib/termium.rb | 26 ++++++++ lib/termium/abbreviation.rb | 58 ++++++++++++++++++ lib/termium/core.rb | 52 ++++++++++++++++ lib/termium/designation_operations.rb | 31 ++++++++++ lib/termium/entry_term.rb | 86 +++++++++++++++++++++++++++ lib/termium/extract.rb | 27 +++++++++ lib/termium/extract_language.rb | 12 ++++ lib/termium/language_module.rb | 80 +++++++++++++++++++++++++ lib/termium/parameter.rb | 12 ++++ lib/termium/source.rb | 34 +++++++++++ lib/termium/source_ref.rb | 9 +++ lib/termium/subject.rb | 14 +++++ lib/termium/textual_support.rb | 83 ++++++++++++++++++++++++++ lib/termium/universal_entry.rb | 27 +++++++++ lib/termium/version.rb | 5 ++ sig/termium.rbs | 4 ++ spec/spec_helper.rb | 15 +++++ spec/termium_spec.rb | 11 ++++ termium.gemspec | 40 +++++++++++++ 28 files changed, 851 insertions(+) create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 .rspec create mode 100644 CODE_OF_CONDUCT.md create mode 100644 Gemfile create mode 100644 Rakefile create mode 100755 bin/console create mode 100755 bin/setup create mode 100755 exe/termium create mode 100644 lib/termium.rb create mode 100644 lib/termium/abbreviation.rb create mode 100644 lib/termium/core.rb create mode 100644 lib/termium/designation_operations.rb create mode 100644 lib/termium/entry_term.rb create mode 100644 lib/termium/extract.rb create mode 100644 lib/termium/extract_language.rb create mode 100644 lib/termium/language_module.rb create mode 100644 lib/termium/parameter.rb create mode 100644 lib/termium/source.rb create mode 100644 
lib/termium/source_ref.rb create mode 100644 lib/termium/subject.rb create mode 100644 lib/termium/textual_support.rb create mode 100644 lib/termium/universal_entry.rb create mode 100644 lib/termium/version.rb create mode 100644 sig/termium.rbs create mode 100644 spec/spec_helper.rb create mode 100644 spec/termium_spec.rb create mode 100644 termium.gemspec diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..47fd776 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,27 @@ +name: Ruby + +on: + push: + branches: + - main + + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + name: Ruby ${{ matrix.ruby }} + strategy: + matrix: + ruby: + - '3.1.2' + + steps: + - uses: actions/checkout@v3 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + bundler-cache: true + - name: Run the default task + run: bundle exec rake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b04a8c8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +/.bundle/ +/.yardoc +/_yardoc/ +/coverage/ +/doc/ +/pkg/ +/spec/reports/ +/tmp/ + +# rspec failure tracking +.rspec_status diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..34c5164 --- /dev/null +++ b/.rspec @@ -0,0 +1,3 @@ +--format documentation +--color +--require spec_helper diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..d46081c --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,84 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. 
Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at ronald.tse@ribose.com. All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of actions. + +**Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. 
No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, +available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line entry point for the termium gem: converts a TERMIUM Plus XML
# extract into a Glossarist dataset written to a directory.

# Required explicitly so the script does not depend on another library
# happening to load them first.
require "pathname"
require "thor"

require_relative "../lib/termium"

# Thor CLI exposing the `convert` command.
class TermiumCommand < Thor
  desc "convert", "Convert Termium entries into a Glossarist dataset"

  option :input_file, aliases: :i, required: true, desc: "Path to TERMIUM Plus XML extract"
  option :output_file, aliases: :o, desc: "Output file path"

  # Reads the XML extract named by --input_file, converts it to a Glossarist
  # concept collection and saves it under the output directory.
  #
  # When --output_file is omitted, the output directory is the input path
  # without its extension (e.g. `data/extract.xml` -> `data/extract`).
  #
  # Raises Thor::Error when the input file is missing or the output path
  # exists but is not a directory; because `exit_on_failure?` is true, Thor
  # prints the message and exits with a non-zero status.
  def convert
    cwd = Pathname.new(Dir.pwd)
    input_path = cwd.join(options[:input_file])

    unless input_path.exist?
      # `raise`, not `throw`: `throw` without a matching `catch` only produces
      # an UncaughtThrowError that hides the real message.
      raise Thor::Error, "Input file `#{options[:input_file]}` does not exist."
    end

    puts "Reading input file: #{input_path.relative_path_from(cwd)}"
    termium_extract = Termium::Extract.from_xml(File.read(input_path.expand_path))

    puts "Size of dataset: #{termium_extract.core.size}"

    puts "Converting to Glossarist..."
    glossarist_coll = termium_extract.to_concept

    output_path = cwd.join(options[:output_file] || default_output_path(input_path))
    ensure_output_directory(output_path)

    glossarist_coll.save_to_files(output_path.expand_path)
    puts "Written Glossarist dataset to #{output_path.relative_path_from(cwd)}"
  end

  # Fallback for calls to undefined methods on the command object: print a
  # hint on stderr and exit with status 1.
  def method_missing(*args)
    warn "No method found named: #{args[0]}"
    warn "Run with `--help` or `-h` to see available options"
    exit 1
  end

  # Ruby invokes this hook with (name, include_private); it must accept both
  # arguments, otherwise every `respond_to?` on an undefined name raises
  # ArgumentError.
  def respond_to_missing?(_name, _include_private = false)
    true
  end

  # Tells Thor to exit with a non-zero status when a command fails.
  def self.exit_on_failure?
    true
  end

  private

  # Default output directory: the input path with its extension removed.
  def default_output_path(input_path)
    input_path.dirname.join(input_path.basename(input_path.extname))
  end

  # Creates the output directory (including missing parents) unless it already
  # exists; refuses to proceed when the path is occupied by a non-directory.
  def ensure_output_directory(path)
    if path.exist? && !path.directory?
      raise Thor::Error, "Output path `#{path}` exists and is not a directory."
    end

    path.mkpath
  end
end

TermiumCommand.start(ARGV)
"deprecated" : "preferred" + } + + # if geographical_area + # set["geographical_area"] = geographical_area + # end + + # if plurality + # set["plurality"] = plurality + # end + + if gender + set["gender"] = gender + end + + if part_of_speech + set["part_of_speech"] = part_of_speech + end + + set + end + + end +end \ No newline at end of file diff --git a/lib/termium/core.rb b/lib/termium/core.rb new file mode 100644 index 0000000..8822078 --- /dev/null +++ b/lib/termium/core.rb @@ -0,0 +1,52 @@ +require_relative 'language_module' +require_relative 'subject' +require_relative 'universal_entry' +require_relative 'source' + +module Termium + + class Core < Shale::Mapper + attribute :identification_number, Shale::Type::String + attribute :dissemination_level, Shale::Type::String + + attribute :language_module, LanguageModule, collection: true + attribute :subject, Subject + attribute :universal_entry, UniversalEntry + attribute :source, Source, collection: true + + xml do + root 'core' + map_attribute 'disseminationLevel', to: :dissemination_level + map_attribute 'identificationNumber', to: :identification_number + map_element 'languageModule', to: :language_module + map_element 'subject', to: :subject + map_element 'universalEntry', to: :universal_entry + map_element 'source', to: :source + end + + # TODO: In Termium XML, each definition per lang or note can be linked to a + # particular source via the sourceRef number. + def concept_sources + source.map(&:to_concept_source) + end + + def to_concept + concept = Glossarist::ManagedConcept.new(id: identification_number) + + language_module.map(&:to_concept).each do |localized_concept| + # TODO: This is needed to skip the empty french entries of 10031781 and 10031778 + next if localized_concept.nil? 
module Termium
  # Mixin for designation mappers. The including object must expose a
  # +parameter+ collection whose items respond to +abbreviation+; the helpers
  # below translate those abbreviation codes into output labels.
  module DesignationOperations
    # Parameter abbreviation code => part-of-speech label.
    PART_OF_SPEECH_CODE_MAPPING = {
      "ADJ" => "adj",
      "N" => "noun",
      "V" => "verb"
    }

    # Parameter abbreviation code => grammatical gender label.
    GENDER_CODE_MAPPING = {
      "F" => "f",
      "M" => "m",
      "EPI" => "c" # "EPI" stands for epicene
    }

    # Returns the label for the first parameter whose abbreviation is a known
    # part-of-speech code, or nil when none of the parameters carries one.
    def part_of_speech
      parameter.each do |param|
        label = PART_OF_SPEECH_CODE_MAPPING[param.abbreviation]
        return label if label
      end

      nil
    end

    # Returns the label for the first parameter whose abbreviation is a known
    # gender code, or nil when none of the parameters carries one.
    def gender
      parameter.each do |param|
        label = GENDER_CODE_MAPPING[param.abbreviation]
        return label if label
      end

      nil
    end
  end
end
"deprecated" : "preferred" + } + + if geographical_area + set["geographical_area"] = geographical_area + end + + if plurality + set["plurality"] = plurality + end + + if gender + set["gender"] = gender + end + + if part_of_speech + set["part_of_speech"] = part_of_speech + end + + set + end + end +end \ No newline at end of file diff --git a/lib/termium/extract.rb b/lib/termium/extract.rb new file mode 100644 index 0000000..7f1b42d --- /dev/null +++ b/lib/termium/extract.rb @@ -0,0 +1,27 @@ +require_relative 'extract_language' +require_relative 'core' + +module Termium + + class Extract < Shale::Mapper + attribute :language, Shale::Type::String + attribute :extract_language, ExtractLanguage, collection: true + attribute :core, Core, collection: true + + xml do + root 'termium_extract' + # namespace 'http://termium.tpsgc-pwgsc.gc.ca/schemas/2012/06/Termium', 'ns2' + + map_attribute 'language', to: :language + map_element 'extractLanguage', to: :extract_language + map_element 'core', to: :core + end + + def to_concept + coll = Glossarist::ManagedConceptCollection.new + coll.managed_concepts = core.map(&:to_concept) + coll + end + end +end + diff --git a/lib/termium/extract_language.rb b/lib/termium/extract_language.rb new file mode 100644 index 0000000..58cac9b --- /dev/null +++ b/lib/termium/extract_language.rb @@ -0,0 +1,12 @@ +module Termium + class ExtractLanguage < Shale::Mapper + attribute :language, Shale::Type::String + attribute :order, Shale::Type::Integer + xml do + root 'extractLanguage' + map_attribute 'language', to: :language + map_attribute 'order', to: :order + end + end + +end diff --git a/lib/termium/language_module.rb b/lib/termium/language_module.rb new file mode 100644 index 0000000..56656f2 --- /dev/null +++ b/lib/termium/language_module.rb @@ -0,0 +1,80 @@ +require_relative 'entry_term' +require_relative 'textual_support' + +module Termium + + class LanguageModule < Shale::Mapper + attribute :language, Shale::Type::String + attribute 
:entry_term, EntryTerm, collection: true + attribute :textual_support, TextualSupport, collection:true + xml do + root 'languageModule' + map_attribute 'language', to: :language + map_element 'entryTerm', to: :entry_term + map_element 'textualSupport', to: :textual_support + end + + def definition_raw + textual_support.detect(&:is_definition?) + end + + def definition + definition_raw ? definition_raw.value_typed : nil + end + + def domain + definition_raw ? definition_raw.domain : nil + end + + def notes + textual_support.select(&:is_note?).map(&:value_typed) + end + + def examples + textual_support.select(&:is_example?).map(&:value_typed) + end + + def abbreviations + entry_term.map(&:abbreviation).flatten + end + + LANGUAGE_CODE_MAPPING = { + "en" => "eng", + "fr" => "fre" + } + + def designations + # NOTE: entry_term is a collection + entry_term + abbreviations + end + + def to_h + lang_code = LANGUAGE_CODE_MAPPING[language.downcase] + + # TODO: This is needed to skip the empty french entries of 10031781 and 10031778 + return nil unless definition + + src = { + "language_code" => lang_code, + "terms" => designations.map(&:to_h), + "definition" => [{ + content: definition + }], + "notes" => notes, + "examples" => examples, + } + + if domain + src["domain"] = domain + end + + src + end + + def to_concept + x = to_h + return nil unless x + Glossarist::LocalizedConcept.new(x) + end + end +end \ No newline at end of file diff --git a/lib/termium/parameter.rb b/lib/termium/parameter.rb new file mode 100644 index 0000000..9259e7f --- /dev/null +++ b/lib/termium/parameter.rb @@ -0,0 +1,12 @@ +module Termium + + class Parameter < Shale::Mapper + # + attribute :abbreviation, Shale::Type::String + xml do + root 'parameter' + map_attribute 'abbreviation', to: :abbreviation + end + end + +end \ No newline at end of file diff --git a/lib/termium/source.rb b/lib/termium/source.rb new file mode 100644 index 0000000..18a2ad2 --- /dev/null +++ b/lib/termium/source.rb @@ -0,0 
require_relative 'source_ref'

module Termium
  # Mapper for a <textualSupport> element: a block of text attached to a
  # language module. An element whose +type+ is "DEF" is a definition; text
  # that starts with an "Example:" style prefix is an example; anything else
  # is treated as a note.
  class TextualSupport < Shale::Mapper
    # Leading "Example:" / "Examples:" / "Exemple:" / "Exemples:" prefix.
    EXAMPLE_REGEX = /\AEx[ea]mples?\s*:\s*/

    # Leading "<domain>" marker at the start of a definition.
    DEFINITION_REGEX = /\A\<(.+?)\>\s*/

    # Any "[...]" span, from the first "[" to the last "]".
    BRACKETED_REGEX = /\[.*\]/

    attribute :order, Shale::Type::Integer
    attribute :type, Shale::Type::String
    attribute :value, Shale::Type::String
    attribute :source_ref, SourceRef

    xml do
      root 'textualSupport'
      map_attribute 'order', to: :order
      map_attribute 'type', to: :type
      map_element 'value', to: :value
      map_element 'sourceRef', to: :source_ref
    end

    # The raw text with every newline-plus-indentation run collapsed into a
    # single space.
    def value_cleaned
      value.gsub(/\n\s+/, " ")
    end

    # The cleaned text with its kind-specific prefix removed: the example
    # prefix for examples, the "<domain>" marker for definitions, nothing
    # for notes. The example check takes precedence over the definition check.
    def value_typed
      if is_example?
        value_example
      elsif is_definition?
        value_definition
      else
        value_cleaned
      end
    end

    # True when the cleaned text starts with an example prefix.
    def is_example?
      value_cleaned.match?(EXAMPLE_REGEX)
    end

    # True when the element is typed as a definition.
    def is_definition?
      type == "DEF"
    end

    # True when the element is neither a definition nor an example.
    def is_note?
      !is_definition? && !is_example?
    end

    # The cleaned text without the leading example prefix.
    def value_example
      # The pattern is anchored with \A, so it can match at most once.
      value_cleaned.sub(EXAMPLE_REGEX, '')
    end

    # The cleaned text without the leading "<domain>" marker.
    def value_definition
      value_cleaned.sub(DEFINITION_REGEX, '')
    end

    # True when the cleaned text starts with a "<domain>" marker.
    def has_domain?
      value_cleaned.match?(DEFINITION_REGEX)
    end

    # The text inside the leading "<...>" marker, or nil when there is none.
    def domain
      value_cleaned[DEFINITION_REGEX, 1]
    end

    # This is an attempt to extract the textual reference within the note.
    # TODO: Use this to correlate the actual term with the source reference, i.e
    # from the following note, the terms "abduction; inférence abductive" come from
    # "ISO-IEC-2382-28-1995".
    # NOTE: abduction; inférence abductive : termes et définition normalisés par l'ISO/CEI [<>].
    #
    # Returns nil when the text has no bracketed span; otherwise an Array with
    # one String per ";"-separated reference found between the brackets, e.g.
    # "[ISO/IEC 2382-13:1996; ISO/IEC 2382-24:1995]" yields
    # ["ISO/IEC 2382-13:1996", "ISO/IEC 2382-24:1995"].
    def source_from_note
      # Reads this element's own text: the class has no `note` attribute.
      bracketed = value_cleaned[BRACKETED_REGEX]
      return nil if bracketed.nil?

      bracketed.delete("[]").strip.split(/\s*;\s*/)
    end
  end
end
# frozen_string_literal: true

require_relative "lib/termium/version"

# Every file tracked by git, relative to the directory holding this gemspec.
all_files_in_git = Dir.chdir(File.expand_path(__dir__)) do
  `git ls-files -z`.split("\x0")
end

Gem::Specification.new do |spec|
  spec.name = "termium"
  spec.version = Termium::VERSION
  spec.authors = ["Ribose"]
  spec.email = ["open.source@ribose.com"]

  spec.summary =
    "Parser for the TERMIUM Plus terminology database of the Government of Canada"
  spec.homepage = "https://github.com/glossarist/termium"
  spec.license = "BSD-2-Clause"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.6.0")

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["bug_tracker_uri"] = "#{spec.homepage}/issues"

  # Specify which files should be added to the gem when it is released.
  # Excluded: test/, spec/, features/ and bin/ directories, plus every
  # dot-prefixed path (.github/, .gitignore, .rspec, ...). `git ls-files`
  # never emits a "./" prefix, so the dot alternative must not require a
  # following slash.
  spec.files = all_files_in_git
    .reject { |f| f.match(%r{\A(?:(?:test|spec|features|bin)/|\.)}) }

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "glossarist", "~> 1.0"
  spec.add_dependency "shale"
  spec.add_dependency "thor"

  spec.add_development_dependency "pry", "~> 0.14.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.10"
  # The Rakefile requires "rubocop/rake_task" and runs rubocop in the default
  # task, so it has to be installable through the bundle.
  spec.add_development_dependency "rubocop", "~> 1.0"
end