From 0e88794aaa0917faa0047512d86f297a8931fb08 Mon Sep 17 00:00:00 2001 From: Laurence Ininda Date: Mon, 14 Oct 2024 09:53:30 -0400 Subject: [PATCH] use dotnet for parsing feeds --- .github/workflows/generate.yml | 45 +- .gitignore | 50 +- FeedParser/FeedParser.csproj | 19 + FeedParser/Models/WorkingFolder.cs | 15 + FeedParser/Program.cs | 68 +++ FeedParser/packages.lock.json | 75 +++ data/podcasts_opml.xml | 878 +++++++++++++---------------- podcast-data-generator.sln | 25 + scripts/generate.py | 9 +- source/lib/helpers.ts | 5 + 10 files changed, 661 insertions(+), 528 deletions(-) create mode 100644 FeedParser/FeedParser.csproj create mode 100644 FeedParser/Models/WorkingFolder.cs create mode 100644 FeedParser/Program.cs create mode 100644 FeedParser/packages.lock.json create mode 100644 podcast-data-generator.sln diff --git a/.github/workflows/generate.yml b/.github/workflows/generate.yml index e6728ea0..691c7924 100644 --- a/.github/workflows/generate.yml +++ b/.github/workflows/generate.yml @@ -14,25 +14,24 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - node-version: [20.x] + dotnet: [8.x] + env: + NUGET_PACKAGES: ${{ github.workspace }}/.nuget/packages steps: - uses: actions/checkout@v3 - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.10' - - name: Cache node modules - uses: actions/cache@v2 - env: - cache-name: node-cache + - name: Setup dotnet + uses: actions/setup-dotnet@v4 with: - # npm cache files are stored in `~/.npm` on Linux/macOS - path: ~/.npm - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('**/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-build-${{ env.cache-name }}- - ${{ runner.os }}-build- - ${{ runner.os }}- + dotnet-version: | + 8.x + cache: true + cache-dependency-path: FeedParser/packages.lock.json + - name: Restore Nuget Packages + run: dotnet restore FeedParser/FeedParser.csproj - uses: actions/cache@v2 env: @@ -44,31 +43,15 @@ jobs: restore-keys: | ${{ runner.os }}-pip- - - name: 'Install npm dependencies 🪨' - if: steps.node-cache.outputs.cache-hit != 'true' - run: npm install - - name: 'Install python deps 🐍' if: steps.python-cache.outputs.cache-hit != 'true' run: pip3 install -r requirements.txt - - name: 'compile 🏗️' - run: npx tsc - - - name: 'Prepare 🧑‍🍳' - run: | - mkdir dist - touch dist/rssUrls.txt - mkdir tmp - mkdir tmp/dist - touch tmp/dist/logs.md - mkdir tmp/dist/podcasts - mkdir tmp/dist/podcasts_palettes - mkdir tmp/dist/.github - mkdir tmp/dist/.github/workflows + - name: 'Build 🏗️' + run: dotnet build FeedParser/FeedParser.csproj - name: 'Generate Files 🤖' - run: node --max-old-space-size=8192 out/index.js + run: dotnet run --project FeedParser/FeedParser.csproj - name: 'Generate Palette 🎨' run: | diff --git a/.gitignore b/.gitignore index 1c66e4f8..00d474c8 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,52 @@ tmp out.json # Other files -*.DS_Store \ No newline at end of file +*.DS_Store + +*.swp +*.*~ +project.lock.json +.DS_Store +*.pyc +nupkg/ + +# Visual Studio Code +.vscode/ + +# Rider +.idea/ + +# Visual Studio +.vs/ + +# Fleet +.fleet/ + +# Code Rush +.cr/ + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +build/ +bld/ +[Bb]in/ +[Oo]bj/ +[Oo]ut/ +msbuild.log +msbuild.err +msbuild.wrn + +# python virtual env +venv/ +path/to/venv/ \ No newline at end of file diff --git a/FeedParser/FeedParser.csproj b/FeedParser/FeedParser.csproj new file mode 100644 index 00000000..258d0a5f --- /dev/null +++ b/FeedParser/FeedParser.csproj @@ -0,0 +1,19 @@ + + + + Exe + net8.0 + enable + enable + true + + + + + + + + + + + diff --git a/FeedParser/Models/WorkingFolder.cs b/FeedParser/Models/WorkingFolder.cs new file mode 100644 index 00000000..8f187893 --- /dev/null +++ b/FeedParser/Models/WorkingFolder.cs @@ -0,0 +1,15 @@ +namespace PodcastDataGenerator.Models +{ + public class WorkingFolder + { + public string Path {get; set;} + public List SubFolderPaths { get; set;} + public List FilesToGenerate { get; set;} + public WorkingFolder(string path, List subFolderPaths, List filesToGenerate) + { + Path = path; + SubFolderPaths = subFolderPaths; + FilesToGenerate = filesToGenerate; + } + } +} \ No newline at end of file diff --git a/FeedParser/Program.cs b/FeedParser/Program.cs new file mode 100644 index 00000000..526e3eed --- /dev/null +++ b/FeedParser/Program.cs @@ -0,0 +1,68 @@ +// See https://aka.ms/new-console-template for more information +using System.Xml; +using CodeHollow.FeedReader; +using Newtonsoft.Json; +using OPMLCore.NET; +using PodcastDataGenerator.Models; +using Slugify; +using Syndication.Parser; + +var workingDir = Directory.GetCurrentDirectory();//(Directory.GetCurrentDirectory()); + +List folders = [ + new WorkingFolder("tmp/dist", ["podcasts","podcasts_palettes",".github/workflows"], ["tmp/dist/logs.md"]), + new WorkingFolder("dist", [], ["dist/rssUrls.txt"]) +]; + +folders.ForEach(folder => +{ + if (Directory.Exists($"{workingDir}/{folder.Path}")) + { + Directory.Delete($"{workingDir}/{folder.Path}", true); + } + Directory.CreateDirectory($"{workingDir}/{folder.Path}"); + folder.SubFolderPaths.ForEach(subFolderPath => Directory.CreateDirectory($"{workingDir}/{folder.Path}/{subFolderPath}")); + folder.FilesToGenerate.ForEach(fileName => { + using(File.Create($"{workingDir}/{fileName}")){} + }); +}); + + +SlugHelper helper = new SlugHelper(); + +Opml opml = new Opml($"{workingDir}/data/podcasts_opml.xml"); + +foreach (Outline outline in opml.Body.Outlines) +{ + Console.WriteLine(outline.Text); +} + +async Task ParseFeed(string xmlUrl) +{ + try + { + var parsedFeed = await FeedReader.ReadAsync(xmlUrl); + Console.WriteLine($"Parsed Feed {parsedFeed.Title}"); + // Write to file + XmlDocument xmlDocument = new XmlDocument(); + xmlDocument.LoadXml(parsedFeed.OriginalDocument); + var feedAsJson = JsonConvert.SerializeXmlNode(xmlDocument, Newtonsoft.Json.Formatting.Indented); + File.WriteAllText($"{workingDir}/tmp/dist/podcasts/{helper.GenerateSlug(parsedFeed.Title)}.json", feedAsJson); + + return parsedFeed; + + } + catch (System.Exception err) + { + + Console.ForegroundColor = ConsoleColor.Red; + Console.WriteLine($"Error fetching feed: {xmlUrl}: {err.Message}"); + Console.ResetColor(); + return null; + } +} + +var feeds = opml.Body.Outlines.First().Outlines; +File.AppendAllLines($"{workingDir}/dist/rssUrls.txt",feeds.Select(feed => feed.XMLUrl)); +var parsedFeeds = feeds.Select(async feedItem => await ParseFeed(feedItem.XMLUrl)); +await Task.WhenAll(parsedFeeds); diff --git a/FeedParser/packages.lock.json b/FeedParser/packages.lock.json new file mode 100644 index 00000000..f24639ee --- /dev/null +++ b/FeedParser/packages.lock.json @@ -0,0 +1,75 @@ +{ + "version": 1, + "dependencies": { + "net8.0": { + "CodeHollow.FeedReader": { + "type": "Direct", + "requested": "[1.2.6, )", + "resolved": "1.2.6", + "contentHash": "KpnP1zlX5zk58PUrREYUXC/7gk8ljjS8mkPfmFeuofaJlHJm62990cZdWLiMBuwWhV/HA6NAu9Ck5uM7GOJl0A==" + }, + "Newtonsoft.Json": { + "type": "Direct", + "requested": "[13.0.3, )", + "resolved": "13.0.3", + "contentHash": "HrC5BXdl00IP9zeV+0Z848QWPAoCr9P3bDEZguI+gkLcBKAOxix/tLEAAHC+UvDNPv4a2d18lOReHMOagPa+zQ==" + }, + "OPMLCore.NET": { + "type": "Direct", + "requested": "[1.0.0, )", + "resolved": "1.0.0", + "contentHash": "7glfVDnpazLC1FjtJ3hH82yRKz9mJTd+9w25V43wZdnUAw/7fIfIzTr0DNj7wZmTMLp/d5+xo4VzYPNtvvNAdA==" + }, + "Slugify.Core": { + "type": "Direct", + "requested": "[4.0.1, )", + "resolved": "4.0.1", + "contentHash": "0YKMECQGdi7O4T1SL1IFaXWSPJdXNFMoGUdxMPsqI+TBuKlhiTD7mBxRH8gd9WD3MJSTEqyF0WsGl7l+NpoXXA==", + "dependencies": { + "System.Memory": "4.5.5", + "System.Text.Encoding.CodePages": "6.0.0" + } + }, + "SyndicationLib": { + "type": "Direct", + "requested": "[0.2.0-beta, )", + "resolved": "0.2.0-beta", + "contentHash": "eqcfwrpj7qrSs0k+e0LQKtTz4qfTaXKuR0rwp4QHeuoyRScd76MOLyETkZwyLUYVz0PI/+FoW1nKpPOftdhL1g==", + "dependencies": { + "Brackets": "0.6.2" + } + }, + "Brackets": { + "type": "Transitive", + "resolved": "0.6.2", + "contentHash": "+6WZO/OgUiVuDaS10NAPOJpd7Gg+F2eiGW+mzcG4CEHxlUbv8p1ENZDVtVeUpUZLRbFk52cKUQZuMVUYVyPvrQ==", + "dependencies": { + "System.IO.Pipelines": "8.0.0" + } + }, + "System.IO.Pipelines": { + "type": "Transitive", + "resolved": "8.0.0", + "contentHash": "FHNOatmUq0sqJOkTx+UF/9YK1f180cnW5FVqnQMvYUN0elp6wFzbtPSiqbo1/ru8ICp43JM1i7kKkk6GsNGHlA==" + }, + "System.Memory": { + "type": "Transitive", + "resolved": "4.5.5", + "contentHash": "XIWiDvKPXaTveaB7HVganDlOCRoj03l+jrwNvcge/t8vhGYKvqV+dMv6G4SAX2NoNmN0wZfVPTAlFwZcZvVOUw==" + }, + "System.Runtime.CompilerServices.Unsafe": { + "type": "Transitive", + "resolved": "6.0.0", + "contentHash": "/iUeP3tq1S0XdNNoMz5C9twLSrM/TH+qElHkXWaPvuNOt+99G75NrV0OS2EqHx5wMN7popYjpc8oTjC1y16DLg==" + }, + "System.Text.Encoding.CodePages": { + "type": "Transitive", + "resolved": "6.0.0", + "contentHash": "ZFCILZuOvtKPauZ/j/swhvw68ZRi9ATCfvGbk1QfydmcXBkIWecWKn/250UH7rahZ5OoDBaiAudJtPvLwzw85A==", + "dependencies": { + "System.Runtime.CompilerServices.Unsafe": "6.0.0" + } + } + } + } +} \ No newline at end of file diff --git a/data/podcasts_opml.xml b/data/podcasts_opml.xml index 6f2dafbe..90a87c76 100644 --- a/data/podcasts_opml.xml +++ b/data/podcasts_opml.xml @@ -5,512 +5,406 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - - - - - + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + \ No newline at end of file diff --git a/podcast-data-generator.sln b/podcast-data-generator.sln new file mode 100644 index 00000000..9fb388e5 --- /dev/null +++ b/podcast-data-generator.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.5.002.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FeedParser", "FeedParser\FeedParser.csproj", "{2FB45180-AA83-44B6-AB9F-88BB105780FE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {2FB45180-AA83-44B6-AB9F-88BB105780FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2FB45180-AA83-44B6-AB9F-88BB105780FE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2FB45180-AA83-44B6-AB9F-88BB105780FE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2FB45180-AA83-44B6-AB9F-88BB105780FE}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {74C6B843-EA6C-4E1E-8D4E-86DB8890EC82} + EndGlobalSection +EndGlobal diff --git a/scripts/generate.py b/scripts/generate.py index e4cf8881..b2e4a1c6 100644 --- a/scripts/generate.py +++ b/scripts/generate.py @@ -59,16 +59,17 @@ def generate_color_palette(podcast_image): data = podFile.read() podcast = json.loads(data) palette = [] - if('image' in podcast): - palette = generate_color_palette(podcast['image']) - podcast['palette'] = palette + if('image' in podcast['rss']['channel']): + print('Generating palette for: ', podcast['rss']['channel']['image']) + palette = generate_color_palette(podcast['rss']['channel']['image']) + podcast['rss']['channel']['image']['palette'] = palette filepath = dist_directory + '/podcasts_palettes/' + podcastFile[:-5]+ '_with_palettes.json' with open(filepath,'w') as outputFile: outputFile.write(simplejson.dumps(podcast, indent=4, sort_keys=True)) outputFile.close() - print("Generated palette for: %s - %.2f%% done" % (podcast['title'], ((index+1)/len(podcastFileNames)*100))) + print("Generated palette for: %s - %.2f%% done" % (podcast['rss']['channel']['title'], ((index+1)/len(podcastFileNames)*100))) print('\n🦩-------------Palette Generation Complete-------------🦜\n\n') diff --git a/source/lib/helpers.ts b/source/lib/helpers.ts index e6a7cc80..470d8f7a 100644 --- a/source/lib/helpers.ts +++ b/source/lib/helpers.ts @@ -12,6 +12,11 @@ const nerParser = new EntityRecognizer({ installPath: 'tmp/stanford-ner-2020-11-17' }) +/** + * + * @param filePath The File path of the OPML File + * @returns A list of RSS Feeds from the OPML File + */ async function getRssFeedsFromOPML(filePath: string): Promise { const opmlContent = fs.readFileSync(filePath, 'utf8') const jsonData = await opmlToJSON(opmlContent)