From e7354e0ad1e041f1a8486453ae832aa2fac4f4a9 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu+github@posteo.de> Date: Wed, 2 Jun 2021 11:13:13 +0200 Subject: [PATCH] Split downloading logic into modules --- Cargo.lock | 6 +- Cargo.toml | 1 + rustfmt.toml | 2 +- src/cli.rs | 24 +- src/ilias.rs | 75 +++-- src/ilias/course.rs | 50 ++++ src/ilias/exercise.rs | 66 +++++ src/ilias/file.rs | 22 ++ src/ilias/folder.rs | 28 ++ src/ilias/forum.rs | 100 +++++++ src/ilias/plugin_dispatch.rs | 87 ++++++ src/ilias/thread.rs | 145 +++++++++ src/ilias/video.rs | 57 ++++ src/ilias/weblink.rs | 70 +++++ src/main.rs | 555 +++-------------------------------- src/queue.rs | 50 ++++ src/util.rs | 21 +- 17 files changed, 806 insertions(+), 553 deletions(-) create mode 100644 src/ilias/course.rs create mode 100644 src/ilias/exercise.rs create mode 100644 src/ilias/file.rs create mode 100644 src/ilias/folder.rs create mode 100644 src/ilias/forum.rs create mode 100644 src/ilias/plugin_dispatch.rs create mode 100644 src/ilias/thread.rs create mode 100644 src/ilias/video.rs create mode 100644 src/ilias/weblink.rs create mode 100644 src/queue.rs diff --git a/Cargo.lock b/Cargo.lock index 2ee73f1..22f2023 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,7 @@ version = "0.2.21" dependencies = [ "anyhow", "atty", + "bytes", "cfg-if", "colored", "cookie_store 0.14.1", @@ -355,11 +356,10 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" dependencies = [ - "autocfg", "cfg-if", "lazy_static", ] diff --git a/Cargo.toml b/Cargo.toml index 3c81f60..6df67db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ atty = "0.2.14" h2 = "0.3.3" cookie_store = "0.14.0" reqwest_cookie_store = "0.1.5" +bytes = "1.0.1" [features] 
default = [] diff --git a/rustfmt.toml b/rustfmt.toml index 9345fb0..350f6e3 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,3 +1,3 @@ hard_tabs = true match_block_trailing_comma = true -max_width = 145 +max_width = 120 diff --git a/src/cli.rs b/src/cli.rs index d6f952d..faf4f5a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -6,8 +6,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize}; #[cfg(feature = "keyring-auth")] use anyhow::anyhow; use anyhow::{Context, Result}; -#[cfg(feature = "keyring-auth")] -use colored::Colorize as _; use indicatif::ProgressBar; use once_cell::sync::Lazy; use structopt::StructOpt; @@ -87,6 +85,8 @@ pub static PROGRESS_BAR: Lazy = Lazy::new(|| ProgressBar::new(0)); macro_rules! log { ($lvl:expr, $($t:expr),+) => {{ + #[allow(unused_imports)] + use colored::Colorize as _; #[allow(unused_comparisons)] // 0 <= 0 if $lvl <= crate::cli::LOG_LEVEL.load(std::sync::atomic::Ordering::SeqCst) { if crate::cli::PROGRESS_BAR_ENABLED.load(std::sync::atomic::Ordering::SeqCst) { @@ -111,21 +111,21 @@ macro_rules! success { } macro_rules! warning { - ($e:expr) => { + ($e:expr) => {{ log!(0, "Warning: {}", format!("{:?}", $e).bright_yellow()); - }; - ($msg:expr, $e:expr) => { + }}; + ($msg:expr, $e:expr) => {{ log!(0, "Warning: {}", format!("{} {:?}", $msg, $e).bright_yellow()); - }; - ($msg1:expr, $msg2:expr, $e:expr) => { + }}; + ($msg1:expr, $msg2:expr, $e:expr) => {{ log!(0, "Warning: {}", format!("{} {} {:?}", $msg1, $msg2, $e).bright_yellow()); - }; - (format => $($e:expr),+) => { + }}; + (format => $($e:expr),+) => {{ log!(0, "Warning: {}", format!($($e),+).bright_yellow()); - }; - ($lvl:expr; $($e:expr),+) => { + }}; + ($lvl:expr; $($e:expr),+) => {{ log!($lvl, "Warning: {}", format!($($e),+).bright_yellow()); - } + }}; } macro_rules! 
error { diff --git a/src/ilias.rs b/src/ilias.rs index 5bf342b..ec497d9 100644 --- a/src/ilias.rs +++ b/src/ilias.rs @@ -3,15 +3,32 @@ use std::{error::Error as _, io::Write, sync::Arc}; use anyhow::{anyhow, Context, Result}; -use colored::Colorize; use cookie_store::CookieStore; use ignore::gitignore::Gitignore; +use once_cell::sync::Lazy; use reqwest::{Client, IntoUrl, Proxy, Url}; use reqwest_cookie_store::CookieStoreMutex; use scraper::{ElementRef, Html, Selector}; use serde_json::json; -use crate::{cli::Opt, get_request_ticket, selectors::*, ILIAS_URL}; +use crate::{cli::Opt, queue, ILIAS_URL}; + +pub mod course; +pub mod exercise; +pub mod file; +pub mod folder; +pub mod forum; +pub mod plugin_dispatch; +pub mod thread; +pub mod video; +pub mod weblink; + +static LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); +static ALERT_DANGER: Lazy = Lazy::new(|| Selector::parse("div.alert-danger").unwrap()); +static IL_CONTENT_CONTAINER: Lazy = Lazy::new(|| Selector::parse("#il_center_col").unwrap()); +static ITEM_PROP: Lazy = Lazy::new(|| Selector::parse("span.il_ItemProperty").unwrap()); +static CONTAINER_ITEMS: Lazy = Lazy::new(|| Selector::parse("div.il_ContainerListItem").unwrap()); +static CONTAINER_ITEM_TITLE: Lazy = Lazy::new(|| Selector::parse("a.il_ContainerItemTitle").unwrap()); pub struct ILIAS { pub opt: Opt, @@ -38,10 +55,9 @@ fn error_is_http2(error: &reqwest::Error) -> bool { impl ILIAS { // TODO: de-duplicate the logic below pub async fn with_session(opt: Opt, session: Arc, ignore: Gitignore) -> Result { - let mut builder = - Client::builder() - .cookie_provider(Arc::clone(&session)) - .user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"))); + let mut builder = Client::builder() + .cookie_provider(Arc::clone(&session)) + .user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"))); if let Some(proxy) = opt.proxy.as_ref() { let proxy = Proxy::all(proxy)?; builder = builder.proxy(proxy); @@ -62,11 +78,9 @@ 
impl ILIAS { let cookie_store = CookieStore::default(); let cookie_store = reqwest_cookie_store::CookieStoreMutex::new(cookie_store); let cookie_store = std::sync::Arc::new(cookie_store); - let mut builder = Client::builder().cookie_provider(Arc::clone(&cookie_store)).user_agent(concat!( - env!("CARGO_PKG_NAME"), - "/", - env!("CARGO_PKG_VERSION") - )); + let mut builder = Client::builder() + .cookie_provider(Arc::clone(&cookie_store)) + .user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"))); if let Some(proxy) = opt.proxy.as_ref() { let proxy = Proxy::all(proxy)?; builder = builder.proxy(proxy); @@ -118,7 +132,10 @@ impl ILIAS { .await?; let dom = Html::parse_document(&login_response); let saml = Selector::parse(r#"input[name="SAMLResponse"]"#).unwrap(); - let saml = dom.select(&saml).next().context("no SAML response, incorrect password?")?; + let saml = dom + .select(&saml) + .next() + .context("no SAML response, incorrect password?")?; let relay_state = Selector::parse(r#"input[name="RelayState"]"#).unwrap(); let relay_state = dom.select(&relay_state).next().context("no relay state")?; info!("Logging into ILIAS.."); @@ -136,7 +153,9 @@ impl ILIAS { pub async fn save_session(&self) -> Result<()> { let session_path = self.opt.output.join(".iliassession"); - let mut writer = std::fs::File::create(session_path).map(std::io::BufWriter::new).unwrap(); + let mut writer = std::fs::File::create(session_path) + .map(std::io::BufWriter::new) + .unwrap(); let store = self.cookies.lock().map_err(|x| anyhow!("{}", x))?; // save all cookies, including session cookies for cookie in store.iter_unexpired().map(serde_json::to_string) { @@ -147,7 +166,7 @@ impl ILIAS { } pub async fn download(&self, url: &str) -> Result { - get_request_ticket().await; + queue::get_request_ticket().await; log!(2, "Downloading {}", url); let url = if url.starts_with("http://") || url.starts_with("https://") { url.to_owned() @@ -171,7 +190,7 @@ impl ILIAS { } pub async fn 
head(&self, url: U) -> Result { - get_request_ticket().await; + queue::get_request_ticket().await; let url = url.into_url()?; for attempt in 1..10 { let result = self.client.head(url.clone()).send().await; @@ -199,7 +218,7 @@ impl ILIAS { } let text = self.download(url).await?.text().await?; let html = Html::parse_document(&text); - if html.select(&alert_danger).next().is_some() { + if html.select(&ALERT_DANGER).next().is_some() { Err(anyhow!("ILIAS error")) } else { Ok(html) @@ -209,7 +228,7 @@ impl ILIAS { pub async fn get_html_fragment(&self, url: &str) -> Result { let text = self.download(url).await?.text().await?; let html = Html::parse_fragment(&text); - if html.select(&alert_danger).next().is_some() { + if html.select(&ALERT_DANGER).next().is_some() { Err(anyhow!("ILIAS error")) } else { Ok(html) @@ -217,9 +236,11 @@ impl ILIAS { } pub fn get_items(html: &Html) -> Vec> { - html.select(&container_items) + html.select(&CONTAINER_ITEMS) .flat_map(|item| { - item.select(&container_item_title).next().map(|link| Object::from_link(item, link)) + item.select(&CONTAINER_ITEM_TITLE) + .next() + .map(|link| Object::from_link(item, link)) // items without links are ignored }) .collect() @@ -229,7 +250,7 @@ impl ILIAS { pub async fn get_course_content(&self, url: &URL) -> Result<(Vec>, Option)> { let html = self.get_html(&url.url).await?; - let main_text = if let Some(el) = html.select(&il_content_container).next() { + let main_text = if let Some(el) = html.select(&IL_CONTENT_CONTAINER).next() { if !el .children() .flat_map(|x| x.value().as_element()) @@ -268,6 +289,10 @@ impl ILIAS { } } +trait IliasObject { + fn download(ilias: Arc) -> Result<()>; +} + #[derive(Debug)] pub enum Object { Course { name: String, url: URL }, @@ -405,9 +430,15 @@ impl Object { // download page containing metadata return Ok(Generic { name, url }); } else { - let mut item_props = item.context("can't construct file object without HTML object")?.select(&item_prop); + let mut item_props = item + 
.context("can't construct file object without HTML object")? + .select(&ITEM_PROP); let ext = item_props.next().context("cannot find file extension")?; - let version = item_props.nth(1).context("cannot find 3rd file metadata")?.text().collect::(); + let version = item_props + .nth(1) + .context("cannot find 3rd file metadata")? + .text() + .collect::(); let version = version.trim(); if let Some(v) = version.strip_prefix("Version: ") { name += "_v"; diff --git a/src/ilias/course.rs b/src/ilias/course.rs new file mode 100644 index 0000000..211b365 --- /dev/null +++ b/src/ilias/course.rs @@ -0,0 +1,50 @@ +use std::{path::PathBuf, sync::Arc}; + +use anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use regex::Regex; + +use crate::{ + process_gracefully, + queue::spawn, + util::{file_escape, write_file_data}, +}; + +use super::{ILIAS, URL}; + +static CMD_NODE_REGEX: Lazy = Lazy::new(|| Regex::new(r#"cmdNode=uf:\w\w"#).unwrap()); + +pub async fn download(path: PathBuf, ilias: Arc, url: &URL, name: &str) -> Result<()> { + let content = if ilias.opt.content_tree { + let html = ilias.download(&url.url).await?.text().await?; + let cmd_node = CMD_NODE_REGEX.find(&html).context("can't find cmdNode")?.as_str()[8..].to_owned(); + let content_tree = ilias.get_course_content_tree(&url.ref_id, &cmd_node).await; + match content_tree { + Ok(tree) => (tree.into_iter().map(Result::Ok).collect(), None), + Err(e) => { + // some folders are hidden on the course page and can only be found via the RSS feed / recent activity / content tree sidebar + // TODO: this is probably never the case for folders? + if html.contains(r#"input[name="cmd[join]""#) { + return Ok(()); // ignore groups we are not in + } + warning!(name, "falling back to incomplete course content extractor!", e); + ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x + }, + } + } else { + ilias.get_course_content(&url).await? 
+ }; + if let Some(s) = content.1.as_ref() { + let path = path.join("course.html"); + write_file_data(&path, &mut s.as_bytes()) + .await + .context("failed to write course page html")?; + } + for item in content.0 { + let item = item?; + let path = path.join(file_escape(item.name())); + let ilias = Arc::clone(&ilias); + spawn(process_gracefully(ilias, path, item)); + } + Ok(()) +} diff --git a/src/ilias/exercise.rs b/src/ilias/exercise.rs new file mode 100644 index 0000000..b2f48c2 --- /dev/null +++ b/src/ilias/exercise.rs @@ -0,0 +1,66 @@ +use std::{collections::HashSet, path::Path, sync::Arc}; + +use anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use scraper::Selector; + +use crate::{process_gracefully, queue::spawn, util::file_escape}; + +use super::{Object, ILIAS, URL}; + +static LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); +static FORM_GROUP: Lazy = Lazy::new(|| Selector::parse(".form-group").unwrap()); +static FORM_NAME: Lazy = Lazy::new(|| Selector::parse(".il_InfoScreenProperty").unwrap()); + +pub async fn download(path: &Path, ilias: Arc, url: &URL) -> Result<()> { + let html = ilias.get_html(&url.url).await?; + let mut filenames = HashSet::new(); + for row in html.select(&FORM_GROUP) { + let link = row.select(&LINKS).next(); + if link.is_none() { + continue; + } + let link = link.unwrap(); + let href = link.value().attr("href"); + if href.is_none() { + continue; + } + let href = href.unwrap(); + let url = URL::from_href(href)?; + let cmd = url.cmd.as_deref().unwrap_or(""); + if cmd != "downloadFile" && cmd != "downloadGlobalFeedbackFile" && cmd != "downloadFeedbackFile" { + continue; + } + // link is definitely just a download link to the exercise or the solution + let name = row + .select(&FORM_NAME) + .next() + .context("link without file name")? 
+ .text() + .collect::() + .trim() + .to_owned(); + let item = Object::File { url, name }; + let mut path = path.to_owned(); + // handle files with the same name + let filename = file_escape(item.name()); + let mut parts = filename.rsplitn(2, '.'); + let extension = parts.next().unwrap_or(&filename); + let name = parts.next().unwrap_or(""); + let mut unique_filename = filename.clone(); + let mut i = 1; + while filenames.contains(&unique_filename) { + i += 1; + if name.is_empty() { + unique_filename = format!("{}{}", extension, i); + } else { + unique_filename = format!("{}{}.{}", name, i, extension); + } + } + filenames.insert(unique_filename.clone()); + path.push(unique_filename); + let ilias = Arc::clone(&ilias); + spawn(process_gracefully(ilias, path, item)); + } + Ok(()) +} diff --git a/src/ilias/file.rs b/src/ilias/file.rs new file mode 100644 index 0000000..1df7b2a --- /dev/null +++ b/src/ilias/file.rs @@ -0,0 +1,22 @@ +use std::{path::Path, sync::Arc}; + +use anyhow::Result; +use tokio::fs; + +use crate::util::write_stream_to_file; + +use super::{ILIAS, URL}; + +pub async fn download(path: &Path, relative_path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if ilias.opt.skip_files { + return Ok(()); + } + if !ilias.opt.force && fs::metadata(&path).await.is_ok() { + log!(2, "Skipping download, file exists already"); + return Ok(()); + } + let data = ilias.download(&url.url).await?; + log!(0, "Writing {}", relative_path.to_string_lossy()); + write_stream_to_file(&path, data.bytes_stream()).await?; + Ok(()) +} diff --git a/src/ilias/folder.rs b/src/ilias/folder.rs new file mode 100644 index 0000000..2948c18 --- /dev/null +++ b/src/ilias/folder.rs @@ -0,0 +1,28 @@ +use std::{path::Path, sync::Arc}; + +use anyhow::{Context, Result}; + +use crate::{ + process_gracefully, + queue::spawn, + util::{file_escape, write_file_data}, +}; + +use super::{ILIAS, URL}; + +pub async fn download(path: &Path, ilias: Arc, url: &URL) -> Result<()> { + let content = 
ilias.get_course_content(&url).await?; + if let Some(s) = content.1.as_ref() { + let path = path.join("folder.html"); + write_file_data(&path, &mut s.as_bytes()) + .await + .context("failed to write folder page html")?; + } + for item in content.0 { + let item = item?; + let path = path.join(file_escape(item.name())); + let ilias = Arc::clone(&ilias); + spawn(process_gracefully(ilias, path, item)); + } + Ok(()) +} diff --git a/src/ilias/forum.rs b/src/ilias/forum.rs new file mode 100644 index 0000000..0e141bc --- /dev/null +++ b/src/ilias/forum.rs @@ -0,0 +1,100 @@ +use std::{path::Path, sync::Arc}; + +use anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use scraper::{Html, Selector}; + +use crate::{ilias::Object, process_gracefully, queue::spawn, util::file_escape}; + +use super::{ILIAS, URL}; + +static LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); +static TABLE_HEADER: Lazy = Lazy::new(|| Selector::parse("th").unwrap()); +static TABLE_ROW: Lazy = Lazy::new(|| Selector::parse("tr").unwrap()); +static TABLE_CELLS: Lazy = Lazy::new(|| Selector::parse("td").unwrap()); + +static FORUM_PAGES: Lazy = + Lazy::new(|| Selector::parse("div.ilTableNav > table > tbody > tr > td > a").unwrap()); + +const NO_ENTRIES: &str = "Keine Einträge"; + +pub async fn download(path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if !ilias.opt.forum { + return Ok(()); + } + let url = &url.url; + let html = { + let data = ilias.download(url); + let html_text = data.await?.text().await?; + let url = { + let html = Html::parse_document(&html_text); + let thread_count_selector = html + .select(&LINKS) + .flat_map(|x| x.value().attr("href")) + .find(|x| x.contains("trows=800")); + if thread_count_selector.is_none() { + if let Some(cell) = html.select(&TABLE_CELLS).next() { + if cell.text().any(|x| x == NO_ENTRIES) { + return Ok(()); // empty forum + } + } + } + thread_count_selector + .context("can't find forum thread count selector (empty forum?)")? 
+ .to_owned() + }; + let data = ilias.download(&url); + let html = data.await?.text().await?; + Html::parse_document(&html) + }; + for row in html.select(&TABLE_ROW) { + if row.value().attr("class") == Some("hidden-print") { + continue; // thread count + } + if row.select(&TABLE_HEADER).next().is_some() { + continue; + } + let cells = row.select(&TABLE_CELLS).collect::>(); + if cells.len() != 6 { + warning!(format => + "Warning: {}{} {} {}", + "unusual table row (", cells.len(), "cells) in", url.to_string() + ); + continue; + } + let link = cells[1].select(&LINKS).next().context("thread link not found")?; + let object = Object::from_link(link, link)?; + let mut path = path.to_owned(); + let name = format!( + "{}_{}", + object.url().thr_pk.as_ref().context("thr_pk not found for thread")?, + link.text().collect::().trim() + ); + path.push(file_escape(&name)); + // FIXME: this heuristic no longer works after downloading attachments + // TODO: set modification date? + let saved_posts = { + match std::fs::read_dir(&path) { + // TODO: make this async + Ok(stream) => stream.count(), + Err(_) => 0, + } + }; + let available_posts = cells[3] + .text() + .next() + .unwrap_or_default() + .trim() + .parse::() + .context("parsing post count failed")?; + if available_posts <= saved_posts && !ilias.opt.force { + continue; + } + let ilias = Arc::clone(&ilias); + spawn(process_gracefully(ilias, path, object)); + } + if html.select(&FORUM_PAGES).count() > 0 { + log!(0, "Ignoring older threads in {:?}..", path); + } + Ok(()) +} diff --git a/src/ilias/plugin_dispatch.rs b/src/ilias/plugin_dispatch.rs new file mode 100644 index 0000000..f5bdaa2 --- /dev/null +++ b/src/ilias/plugin_dispatch.rs @@ -0,0 +1,87 @@ +use std::{path::Path, sync::Arc}; + +use anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use reqwest::Url; +use scraper::{Html, Selector}; + +use crate::{ilias::Object, process_gracefully, queue::spawn, util::file_escape, ILIAS_URL}; + +use super::{ILIAS, URL}; + +static 
LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); +static A_TARGET_BLANK: Lazy = Lazy::new(|| Selector::parse(r#"a[target="_blank"]"#).unwrap()); +static VIDEO_ROWS: Lazy = Lazy::new(|| Selector::parse(".ilTableOuter > div > table > tbody > tr").unwrap()); +static TABLE_CELLS: Lazy = Lazy::new(|| Selector::parse("td").unwrap()); + +const NO_ENTRIES: &str = "Keine Einträge"; + +pub async fn download(path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if ilias.opt.no_videos { + return Ok(()); + } + let full_url = { + // first find the link to full video list + let list_url = format!("{}ilias.php?ref_id={}&cmdClass=xocteventgui&cmdNode=nc:n4:14u&baseClass=ilObjPluginDispatchGUI&lang=de&limit=20&cmd=asyncGetTableGUI&cmdMode=asynch", ILIAS_URL, url.ref_id); + log!(1, "Loading {}", list_url); + let data = ilias.download(&list_url).await?; + let html = data.text().await?; + let html = Html::parse_fragment(&html); + html.select(&LINKS) + .filter_map(|link| link.value().attr("href")) + .filter(|href| href.contains("trows=800")) + .map(|x| x.to_string()) + .next() + .context("video list link not found")? 
+ }; + log!(1, "Rewriting {}", full_url); + let mut full_url = Url::parse(&format!("{}{}", ILIAS_URL, full_url))?; + let mut query_parameters = full_url + .query_pairs() + .map(|(x, y)| (x.into_owned(), y.into_owned())) + .collect::>(); + for (key, value) in &mut query_parameters { + match key.as_ref() { + "cmd" => *value = "asyncGetTableGUI".into(), + "cmdClass" => *value = "xocteventgui".into(), + _ => {}, + } + } + query_parameters.push(("cmdMode".into(), "asynch".into())); + full_url + .query_pairs_mut() + .clear() + .extend_pairs(&query_parameters) + .finish(); + log!(1, "Loading {}", full_url); + let data = ilias.download(full_url.as_str()).await?; + let html = data.text().await?; + let html = Html::parse_fragment(&html); + for row in html.select(&VIDEO_ROWS) { + let link = row.select(&A_TARGET_BLANK).next(); + if link.is_none() { + if !row.text().any(|x| x == NO_ENTRIES) { + warning!(format => "table row without link in {}", url.url); + } + continue; + } + let link = link.unwrap(); + let mut cells = row.select(&TABLE_CELLS); + if let Some(title) = cells.nth(2) { + let title = title.text().collect::(); + let title = title.trim(); + if title.starts_with(" = Lazy::new(|| Selector::parse("a").unwrap()); +static IMAGES: Lazy = Lazy::new(|| Selector::parse("img").unwrap()); +static TABLES: Lazy = Lazy::new(|| Selector::parse("table").unwrap()); +static LINK_IN_TABLE: Lazy = Lazy::new(|| Selector::parse("tbody tr td a").unwrap()); +static POST_ROW: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostRow").unwrap()); +static POST_TITLE: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostTitle").unwrap()); +static POST_CONTAINER: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostContentContainer").unwrap()); +static POST_ATTACHMENTS: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostAttachmentsContainer").unwrap()); +static SPAN_SMALL: Lazy = Lazy::new(|| Selector::parse("span.small").unwrap()); +static IMAGE_SRC_REGEX: Lazy = Lazy::new(|| 
Regex::new(r#"\./data/produktiv/mobs/mm_(\d+)/([^?]+).+"#).unwrap()); + +pub async fn download(path: &Path, relative_path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if !ilias.opt.forum { + return Ok(()); + } + let mut all_images = Vec::new(); + let mut attachments = Vec::new(); + { + let html = ilias.get_html(&url.url).await?; + for post in html.select(&POST_ROW) { + let title = post + .select(&POST_TITLE) + .next() + .context("post title not found")? + .text() + .collect::(); + let author = post.select(&SPAN_SMALL).next().context("post author not found")?; + let author = author.text().collect::(); + let author = author.trim().split('|').collect::>(); + let author = if author.len() == 2 { + author[0] // pseudonymous forum + } else if author.len() == 3 { + if author[1] != "Pseudonym" { + author[1] + } else { + author[0] + } + } else { + return Err(anyhow!("author data in unknown format")); + } + .trim(); + let container = post + .select(&POST_CONTAINER) + .next() + .context("post container not found")?; + let link = container.select(&LINKS).next().context("post link not found")?; + let id = link.value().attr("id").context("no id in thread link")?.to_owned(); + let name = format!("{}_{}_{}.html", id, author, title.trim()); + let data = container.inner_html(); + let path = path.join(file_escape(&name)); + let relative_path = relative_path.join(file_escape(&name)); + spawn(handle_gracefully(async move { + log!(0, "Writing {}", relative_path.display()); + write_file_data(&path, &mut data.as_bytes()) + .await + .context("failed to write forum post") + })); + let images = container + .select(&IMAGES) + .map(|x| x.value().attr("src").map(|x| x.to_owned())); + for image in images { + let image = image.context("no src on image")?; + all_images.push((id.clone(), image)); + } + if let Some(container) = container.select(&POST_ATTACHMENTS).next() { + for attachment in container.select(&LINKS) { + attachments.push(( + id.clone(), + attachment.text().collect::(), + 
attachment.value().attr("href").map(|x| x.to_owned()), + )); + } + } + } + // pagination + if let Some(pages) = html.select(&TABLES).next() { + if let Some(last) = pages.select(&LINK_IN_TABLE).last() { + let text = last.text().collect::(); + if text.trim() == ">>" { + // not last page yet + let ilias = Arc::clone(&ilias); + let next_page = Object::Thread { + url: URL::from_href(last.value().attr("href").context("page link not found")?)?, + }; + spawn(process_gracefully(ilias, path.to_owned(), next_page)); + } + } else { + log!( + 0, + "Warning: {} {}", + "unable to find pagination links in".bright_yellow(), + url.url.to_string().bright_yellow() + ); + } + } + } + for (id, image) in all_images { + let src = URL::from_href(&image)?; + let dl = ilias.download(&src.url).await?; + let mut path = path.to_owned(); + if let Some(m) = IMAGE_SRC_REGEX.captures(&image) { + // image uploaded to ILIAS + let (media_id, filename) = (m.get(1).unwrap().as_str(), m.get(2).unwrap().as_str()); + path.push(file_escape(&format!("{}_{}_{}", id, media_id, filename))); + } else { + // external image + path.push(file_escape(&format!("{}_{}", id, image))); + } + spawn(handle_gracefully(async move { + let bytes = dl.bytes().await?; + write_file_data(&path, &mut &*bytes) + .await + .context("failed to write forum post image attachment") + })); + } + for (id, name, url) in attachments { + let url = url.context("attachment without href")?; + let src = URL::from_href(&url)?; + let dl = ilias.download(&src.url).await?; + let mut path = path.to_owned(); + path.push(file_escape(&format!("{}_{}", id, name))); + spawn(handle_gracefully(async move { + let bytes = dl.bytes().await?; + write_file_data(&path, &mut &*bytes) + .await + .context("failed to write forum post file attachment") + })); + } + Ok(()) +} diff --git a/src/ilias/video.rs b/src/ilias/video.rs new file mode 100644 index 0000000..e322544 --- /dev/null +++ b/src/ilias/video.rs @@ -0,0 +1,57 @@ +use std::{path::Path, sync::Arc}; + +use 
anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use regex::Regex; +use tokio::fs; + +use crate::{util::write_stream_to_file, ILIAS_URL}; + +use super::{ILIAS, URL}; + +static XOCT_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"(?m)"#).unwrap()); + +pub async fn download(path: &Path, relative_path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if ilias.opt.no_videos { + return Ok(()); + } + if fs::metadata(&path).await.is_ok() && !(ilias.opt.force || ilias.opt.check_videos) { + log!(2, "Skipping download, file exists already"); + return Ok(()); + } + let url = format!("{}{}", ILIAS_URL, url.url); + let data = ilias.download(&url); + let html = data.await?.text().await?; + log!(2, "{}", html); + let json: serde_json::Value = { + let mut json_capture = XOCT_REGEX.captures_iter(&html); + let json = &json_capture.next().context("xoct player json not found")?[1]; + log!(2, "{}", json); + let json = json.split(",\n").next().context("invalid xoct player json")?; + serde_json::from_str(&json.trim())? + }; + log!(2, "{}", json); + let url = json + .pointer("/streams/0/sources/mp4/0/src") + .context("video src not found")? + .as_str() + .context("video src not string")?; + let meta = fs::metadata(&path).await; + if !ilias.opt.force && meta.is_ok() && ilias.opt.check_videos { + let head = ilias.head(url).await.context("HEAD request failed")?; + if let Some(len) = head.headers().get("content-length") { + if meta?.len() != len.to_str()?.parse::()? 
{ + warning!( + relative_path.to_string_lossy(), + "was updated, consider moving the outdated file" + ); + } + } + } else { + let resp = ilias.download(&url).await?; + log!(0, "Writing {}", relative_path.to_string_lossy()); + write_stream_to_file(&path, resp.bytes_stream()).await?; + } + Ok(()) +} diff --git a/src/ilias/weblink.rs b/src/ilias/weblink.rs new file mode 100644 index 0000000..72e7763 --- /dev/null +++ b/src/ilias/weblink.rs @@ -0,0 +1,70 @@ +use std::{path::Path, sync::Arc}; + +use anyhow::{Context, Result}; +use once_cell::sync::Lazy; +use scraper::Selector; +use tokio::fs; + +use crate::{ + util::{create_dir, file_escape, write_file_data}, + ILIAS_URL, +}; + +use super::{ILIAS, URL}; + +static LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); + +pub async fn download(path: &Path, relative_path: &Path, ilias: Arc, url: &URL) -> Result<()> { + if !ilias.opt.force && fs::metadata(&path).await.is_ok() { + log!(2, "Skipping download, link exists already"); + return Ok(()); + } + let head_req_result = ilias.head(&url.url).await; + let url = match &head_req_result { + Err(e) => e.url().context("HEAD request failed")?.as_str(), + Ok(head) => head.url().as_str(), + }; + if url.starts_with(ILIAS_URL) { + // is a link list + if fs::metadata(&path).await.is_err() { + create_dir(&path).await?; + log!(0, "Writing {}", relative_path.to_string_lossy()); + } + + let urls = { + let html = ilias.get_html(url).await?; + html.select(&LINKS) + .filter_map(|x| x.value().attr("href").map(|y| (y, x.text().collect::()))) + .map(|(x, y)| { + URL::from_href(x) + .map(|z| (z, y.trim().to_owned())) + .context("parsing weblink") + }) + .collect::>>() + }?; + + for (url, name) in urls { + if url.cmd.as_deref().unwrap_or("") != "callLink" { + continue; + } + + let head = ilias + .head(url.url.as_str()) + .await + .context("HEAD request to web link failed"); + if let Some(err) = head.as_ref().err() { + warning!(err); + continue; + } + let head = head.unwrap(); + let url = 
head.url().as_str(); + write_file_data(path.join(file_escape(&name)), &mut url.as_bytes()).await?; + } + } else { + log!(0, "Writing {}", relative_path.to_string_lossy()); + write_file_data(&path, &mut url.as_bytes()) + .await + .context("failed to save weblink URL")?; + } + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 9b29d9d..197e931 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,26 +1,14 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#![allow(clippy::upper_case_acronyms)] - use anyhow::{anyhow, Context, Result}; -use colored::Colorize; use futures::future::{self, Either}; -use futures_channel::mpsc::UnboundedSender; -use futures_util::stream::TryStreamExt; -use futures_util::StreamExt; +use futures::StreamExt; use ignore::gitignore::Gitignore; use indicatif::{ProgressDrawTarget, ProgressStyle}; -use once_cell::sync::{Lazy, OnceCell}; -use scraper::Html; use structopt::StructOpt; -use tokio::task::{self, JoinHandle}; -use tokio::{fs, sync::Semaphore, time}; -use tokio_util::io::StreamReader; -use url::Url; +use tokio::fs; -use std::collections::HashSet; use std::future::Future; -use std::io; use std::io::BufReader; use std::path::PathBuf; use std::sync::atomic::Ordering; @@ -29,7 +17,8 @@ use std::time::SystemTime; static ILIAS_URL: &str = "https://ilias.studium.kit.edu/"; /// main personal desktop -static DEFAULT_SYNC_URL: &str = "https://ilias.studium.kit.edu/ilias.php?baseClass=ilPersonalDesktopGUI&cmd=jumpToSelectedItems"; +static DEFAULT_SYNC_URL: &str = + "https://ilias.studium.kit.edu/ilias.php?baseClass=ilPersonalDesktopGUI&cmd=jumpToSelectedItems"; #[macro_use] mod cli; @@ -37,35 +26,13 @@ use cli::*; mod ilias; use ilias::*; use Object::*; +mod queue; mod util; use util::*; -/// Global job queue -static TASKS: OnceCell>> = OnceCell::new(); -static TASKS_RUNNING: Lazy = Lazy::new(|| Semaphore::new(0)); -static REQUEST_TICKETS: Lazy = Lazy::new(|| Semaphore::new(0)); - -pub async fn get_request_ticket() { - 
REQUEST_TICKETS.acquire().await.unwrap().forget(); -} - -macro_rules! spawn { - ($e:expr) => { - TASKS.get().unwrap().unbounded_send(task::spawn($e)).unwrap(); - }; -} - #[tokio::main] async fn main() { let opt = Opt::from_args(); - let rate = opt.rate; - task::spawn(async move { - let mut interval = time::interval(time::Duration::from_secs_f64(60.0 / rate as f64)); - loop { - interval.tick().await; - REQUEST_TICKETS.add_permits(1); - } - }); if let Err(e) = real_main(opt).await { error!(e); } @@ -105,7 +72,7 @@ async fn login(opt: Opt, ignore: Gitignore) -> Result { error!(e) } else { success!("Session still active!"); - return Ok(ilias) + return Ok(ilias); } }, Err(e) => warning!(e), @@ -141,9 +108,13 @@ async fn real_main(mut opt: Opt) -> Result<()> { #[cfg(windows)] let _ = colored::control::set_virtual_terminal(true); - create_dir(&opt.output).await.context("failed to create output directory")?; + create_dir(&opt.output) + .await + .context("failed to create output directory")?; // use UNC paths on Windows (to avoid the default max. 
path length of 255) - opt.output = fs::canonicalize(opt.output).await.context("failed to canonicalize output directory")?; + opt.output = fs::canonicalize(opt.output) + .await + .context("failed to canonicalize output directory")?; // load .iliasignore file let (ignore, error) = Gitignore::new(opt.output.join(".iliasignore")); @@ -151,6 +122,8 @@ async fn real_main(mut opt: Opt) -> Result<()> { warning!(err); } + queue::set_download_rate(opt.rate); + let ilias = login(opt, ignore).await?; if ilias.opt.content_tree { @@ -162,9 +135,7 @@ async fn real_main(mut opt: Opt) -> Result<()> { } } let ilias = Arc::new(ilias); - let (tx, mut rx) = futures_channel::mpsc::unbounded::>(); - TASKS.get_or_init(|| tx.clone()); - TASKS_RUNNING.add_permits(ilias.opt.jobs); + let mut rx = queue::set_parallel_jobs(ilias.opt.jobs); PROGRESS_BAR_ENABLED.store(atty::is(atty::Stream::Stdout), Ordering::SeqCst); if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) { PROGRESS_BAR.set_draw_target(ProgressDrawTarget::stderr_nohz()); @@ -173,8 +144,13 @@ async fn real_main(mut opt: Opt) -> Result<()> { } let sync_url = ilias.opt.sync_url.as_deref().unwrap_or(DEFAULT_SYNC_URL); - let obj = Object::from_url(URL::from_href(sync_url).context("invalid sync URL")?, String::new(), None).context("invalid sync object")?; // name can be empty for first element - spawn!(process_gracefully(ilias.clone(), ilias.opt.output.clone(), obj)); + let obj = Object::from_url( + URL::from_href(sync_url).context("invalid sync URL")?, + String::new(), + None, + ) + .context("invalid sync object")?; + queue::spawn(process_gracefully(ilias.clone(), ilias.opt.output.clone(), obj)); while let Either::Left((task, _)) = future::select(rx.next(), future::ready(())).await { if let Some(task) = task { @@ -208,11 +184,11 @@ async fn real_main(mut opt: Opt) -> Result<()> { // https://github.com/rust-lang/rust/issues/53690#issuecomment-418911229 #[allow(clippy::manual_async_fn)] fn process_gracefully(ilias: Arc, path: PathBuf, obj: 
Object) -> impl Future + Send { + if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) { + PROGRESS_BAR.inc_length(1); + } async move { - if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) { - PROGRESS_BAR.inc_length(1); - } - let permit = TASKS_RUNNING.acquire().await.unwrap(); + let permit = queue::get_ticket().await; let path_text = path.to_string_lossy().into_owned(); if let Err(e) = process(ilias, path, obj).await.context("failed to process URL") { error!("Syncing {}", path_text; e); @@ -227,46 +203,9 @@ async fn handle_gracefully(fut: impl Future>) { } } -#[allow(non_upper_case_globals)] -mod selectors { - use once_cell::sync::Lazy; - use regex::Regex; - use scraper::Selector; - // construct CSS selectors once - pub static LINKS: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); - pub static a_target_blank: Lazy = Lazy::new(|| Selector::parse(r#"a[target="_blank"]"#).unwrap()); - pub static IMAGES: Lazy = Lazy::new(|| Selector::parse("img").unwrap()); - pub static TABLES: Lazy = Lazy::new(|| Selector::parse("table").unwrap()); - pub static VIDEO_ROWS: Lazy = Lazy::new(|| Selector::parse(".ilTableOuter > div > table > tbody > tr").unwrap()); - pub static links_in_table: Lazy = Lazy::new(|| Selector::parse("tbody tr td a").unwrap()); - pub static th: Lazy = Lazy::new(|| Selector::parse("th").unwrap()); - pub static td: Lazy = Lazy::new(|| Selector::parse("td").unwrap()); - pub static tr: Lazy = Lazy::new(|| Selector::parse("tr").unwrap()); - pub static post_row: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostRow").unwrap()); - pub static post_title: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostTitle").unwrap()); - pub static post_container: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostContentContainer").unwrap()); - pub static post_attachments: Lazy = Lazy::new(|| Selector::parse(".ilFrmPostAttachmentsContainer").unwrap()); - pub static span_small: Lazy = Lazy::new(|| Selector::parse("span.small").unwrap()); - pub static forum_pages: Lazy = Lazy::new(|| 
Selector::parse("div.ilTableNav > table > tbody > tr > td > a").unwrap()); - pub static alert_danger: Lazy = Lazy::new(|| Selector::parse("div.alert-danger").unwrap()); - pub static form_group: Lazy = Lazy::new(|| Selector::parse(".form-group").unwrap()); - pub static form_name: Lazy = Lazy::new(|| Selector::parse(".il_InfoScreenProperty").unwrap()); - pub static cmd_node_regex: Lazy = Lazy::new(|| Regex::new(r#"cmdNode=uf:\w\w"#).unwrap()); - pub static image_src_regex: Lazy = Lazy::new(|| Regex::new(r#"\./data/produktiv/mobs/mm_(\d+)/([^?]+).+"#).unwrap()); - pub static XOCT_REGEX: Lazy = Lazy::new(|| Regex::new(r#"(?m)"#).unwrap()); - pub static il_content_container: Lazy = Lazy::new(|| Selector::parse("#il_center_col").unwrap()); - pub static item_prop: Lazy = Lazy::new(|| Selector::parse("span.il_ItemProperty").unwrap()); - pub static container_items: Lazy = Lazy::new(|| Selector::parse("div.il_ContainerListItem").unwrap()); - pub static container_item_title: Lazy = Lazy::new(|| Selector::parse("a.il_ContainerItemTitle").unwrap()); -} -use crate::selectors::*; - -const NO_ENTRIES: &str = "Keine Einträge"; - async fn process(ilias: Arc, path: PathBuf, obj: Object) -> Result<()> { let relative_path = path.strip_prefix(&ilias.opt.output).unwrap(); if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) { - PROGRESS_BAR.inc(1); let path = relative_path.display().to_string(); if !path.is_empty() { PROGRESS_BAR.set_message(path); @@ -284,447 +223,31 @@ async fn process(ilias: Arc, path: PathBuf, obj: Object) -> Result<()> { } match &obj { Course { url, name } => { - let content = if ilias.opt.content_tree { - let html = ilias.download(&url.url).await?.text().await?; - let cmd_node = cmd_node_regex.find(&html).context("can't find cmdNode")?.as_str()[8..].to_owned(); - let content_tree = ilias.get_course_content_tree(&url.ref_id, &cmd_node).await; - match content_tree { - Ok(tree) => (tree.into_iter().map(Result::Ok).collect(), None), - Err(e) => { - // some folders are hidden 
on the course page and can only be found via the RSS feed / recent activity / content tree sidebar - // TODO: this is probably never the case for folders? - if html.contains(r#"input[name="cmd[join]""#) { - return Ok(()); // ignore groups we are not in - } - warning!(name, "falling back to incomplete course content extractor!", e); - ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x - }, - } - } else { - ilias.get_course_content(&url).await? - }; - if let Some(s) = content.1.as_ref() { - let path = path.join("course.html"); - write_file_data(&path, &mut s.as_bytes()) - .await - .context("failed to write course page html")?; - } - for item in content.0 { - let item = item?; - let path = path.join(file_escape(item.name())); - let ilias = Arc::clone(&ilias); - spawn!(process_gracefully(ilias, path, item)); - } + ilias::course::download(path, ilias, url, name).await?; }, Folder { url, .. } | PersonalDesktop { url } => { - let content = ilias.get_course_content(&url).await?; - if let Some(s) = content.1.as_ref() { - let path = path.join("folder.html"); - write_file_data(&path, &mut s.as_bytes()) - .await - .context("failed to write folder page html")?; - } - for item in content.0 { - let item = item?; - let path = path.join(file_escape(item.name())); - let ilias = Arc::clone(&ilias); - spawn!(process_gracefully(ilias, path, item)); - } + ilias::folder::download(&path, ilias, url).await?; }, File { url, .. 
} => { - if ilias.opt.skip_files { - return Ok(()); - } - if !ilias.opt.force && fs::metadata(&path).await.is_ok() { - log!(2, "Skipping download, file exists already"); - return Ok(()); - } - let data = ilias.download(&url.url).await?; - let mut reader = StreamReader::new(data.bytes_stream().map_err(|x| io::Error::new(io::ErrorKind::Other, x))); - log!(0, "Writing {}", relative_path.to_string_lossy()); - write_file_data(&path, &mut reader).await?; + ilias::file::download(&path, relative_path, ilias, url).await?; }, PluginDispatch { url, .. } => { - if ilias.opt.no_videos { - return Ok(()); - } - let full_url = { - // first find the link to full video list - let list_url = format!("{}ilias.php?ref_id={}&cmdClass=xocteventgui&cmdNode=nc:n4:14u&baseClass=ilObjPluginDispatchGUI&lang=de&limit=20&cmd=asyncGetTableGUI&cmdMode=asynch", ILIAS_URL, url.ref_id); - log!(1, "Loading {}", list_url); - let data = ilias.download(&list_url).await?; - let html = data.text().await?; - let html = Html::parse_fragment(&html); - html.select(&LINKS) - .filter_map(|link| link.value().attr("href")) - .filter(|href| href.contains("trows=800")) - .map(|x| x.to_string()) - .next() - .context("video list link not found")? 
- }; - log!(1, "Rewriting {}", full_url); - let mut full_url = Url::parse(&format!("{}{}", ILIAS_URL, full_url))?; - let mut query_parameters = full_url.query_pairs().map(|(x, y)| (x.into_owned(), y.into_owned())).collect::>(); - for (key, value) in &mut query_parameters { - match key.as_ref() { - "cmd" => *value = "asyncGetTableGUI".into(), - "cmdClass" => *value = "xocteventgui".into(), - _ => {}, - } - } - query_parameters.push(("cmdMode".into(), "asynch".into())); - full_url.query_pairs_mut().clear().extend_pairs(&query_parameters).finish(); - log!(1, "Loading {}", full_url); - let data = ilias.download(full_url.as_str()).await?; - let html = data.text().await?; - let html = Html::parse_fragment(&html); - for row in html.select(&VIDEO_ROWS) { - let link = row.select(&a_target_blank).next(); - if link.is_none() { - if !row.text().any(|x| x == NO_ENTRIES) { - warning!(format => "table row without link in {}", url.url); - } - continue; - } - let link = link.unwrap(); - let mut cells = row.select(&td); - if let Some(title) = cells.nth(2) { - let title = title.text().collect::(); - let title = title.trim(); - if title.starts_with(" { - if ilias.opt.no_videos { - return Ok(()); - } - if fs::metadata(&path).await.is_ok() && !(ilias.opt.force || ilias.opt.check_videos) { - log!(2, "Skipping download, file exists already"); - return Ok(()); - } - let url = format!("{}{}", ILIAS_URL, url.url); - let data = ilias.download(&url); - let html = data.await?.text().await?; - log!(2, "{}", html); - let json: serde_json::Value = { - let mut json_capture = XOCT_REGEX.captures_iter(&html); - let json = &json_capture.next().context("xoct player json not found")?[1]; - log!(2, "{}", json); - let json = json.split(",\n").next().context("invalid xoct player json")?; - serde_json::from_str(&json.trim())? - }; - log!(2, "{}", json); - let url = json - .pointer("/streams/0/sources/mp4/0/src") - .context("video src not found")? 
- .as_str() - .context("video src not string")?; - let meta = fs::metadata(&path).await; - if !ilias.opt.force && meta.is_ok() && ilias.opt.check_videos { - let head = ilias.head(url).await.context("HEAD request failed")?; - if let Some(len) = head.headers().get("content-length") { - if meta?.len() != len.to_str()?.parse::()? { - warning!(relative_path.to_string_lossy(), "was updated, consider moving the outdated file"); - } - } - } else { - let resp = ilias.download(&url).await?; - let mut reader = StreamReader::new(resp.bytes_stream().map_err(|x| io::Error::new(io::ErrorKind::Other, x))); - log!(0, "Writing {}", relative_path.to_string_lossy()); - write_file_data(&path, &mut reader).await?; - } + ilias::video::download(&path, relative_path, ilias, url).await?; }, Forum { url, .. } => { - if !ilias.opt.forum { - return Ok(()); - } - let url = &url.url; - let html = { - let data = ilias.download(url); - let html_text = data.await?.text().await?; - let url = { - let html = Html::parse_document(&html_text); - let thread_count_selector = html.select(&LINKS).flat_map(|x| x.value().attr("href")).find(|x| x.contains("trows=800")); - if thread_count_selector.is_none() { - if let Some(cell) = html.select(&td).next() { - if cell.text().any(|x| x == NO_ENTRIES) { - return Ok(()); // empty forum - } - } - } - thread_count_selector - .context("can't find forum thread count selector (empty forum?)")? 
- .to_owned() - }; - let data = ilias.download(&url); - let html = data.await?.text().await?; - Html::parse_document(&html) - }; - for row in html.select(&tr) { - if row.value().attr("class") == Some("hidden-print") { - continue; // thread count - } - if row.select(&th).next().is_some() { - continue; // table header - } - let cells = row.select(&td).collect::>(); - if cells.len() != 6 { - warning!(format => - "Warning: {}{} {} {}", - "unusual table row (", cells.len(), "cells) in", url.to_string() - ); - continue; - } - let link = cells[1].select(&LINKS).next().context("thread link not found")?; - let object = Object::from_link(link, link)?; - let mut path = path.clone(); - let name = format!( - "{}_{}", - object.url().thr_pk.as_ref().context("thr_pk not found for thread")?, - link.text().collect::().trim() - ); - path.push(file_escape(&name)); - // TODO: set modification date? - let saved_posts = { - match std::fs::read_dir(&path) { - // TODO: make this async - Ok(stream) => stream.count(), - Err(_) => 0, - } - }; - let available_posts = cells[3] - .text() - .next() - .unwrap_or_default() - .trim() - .parse::() - .context("parsing post count failed")?; - if available_posts <= saved_posts && !ilias.opt.force { - continue; - } - let ilias = Arc::clone(&ilias); - spawn!(process_gracefully(ilias, path, object)); - } - if html.select(&forum_pages).count() > 0 { - log!(0, "Ignoring older threads in {:?}..", path); - } + ilias::forum::download(&path, ilias, url).await?; }, Thread { url } => { - if !ilias.opt.forum { - return Ok(()); - } - let mut all_images = Vec::new(); - let mut attachments = Vec::new(); - { - let html = ilias.get_html(&url.url).await?; - for post in html.select(&post_row) { - let title = post - .select(&post_title) - .next() - .context("post title not found")? 
- .text() - .collect::(); - let author = post.select(&span_small).next().context("post author not found")?; - let author = author.text().collect::(); - let author = author.trim().split('|').collect::>(); - let author = if author.len() == 2 { - author[0] // pseudonymous forum - } else if author.len() == 3 { - if author[1] != "Pseudonym" { - author[1] - } else { - author[0] - } - } else { - return Err(anyhow!("author data in unknown format")); - } - .trim(); - let container = post.select(&post_container).next().context("post container not found")?; - let link = container.select(&LINKS).next().context("post link not found")?; - let id = link.value().attr("id").context("no id in thread link")?.to_owned(); - let name = format!("{}_{}_{}.html", id, author, title.trim()); - let data = container.inner_html(); - let path = path.join(file_escape(&name)); - let relative_path = relative_path.join(file_escape(&name)); - spawn!(handle_gracefully(async move { - log!(0, "Writing {}", relative_path.display()); - write_file_data(&path, &mut data.as_bytes()).await.context("failed to write forum post") - })); - let images = container.select(&IMAGES).map(|x| x.value().attr("src").map(|x| x.to_owned())); - for image in images { - let image = image.context("no src on image")?; - all_images.push((id.clone(), image)); - } - if let Some(container) = container.select(&post_attachments).next() { - for attachment in container.select(&LINKS) { - attachments.push(( - id.clone(), - attachment.text().collect::(), - attachment.value().attr("href").map(|x| x.to_owned()), - )); - } - } - } - // pagination - if let Some(pages) = html.select(&TABLES).next() { - if let Some(last) = pages.select(&links_in_table).last() { - let text = last.text().collect::(); - if text.trim() == ">>" { - // not last page yet - let ilias = Arc::clone(&ilias); - let next_page = Thread { - url: URL::from_href(last.value().attr("href").context("page link not found")?)?, - }; - spawn!(process_gracefully(ilias, path.clone(), 
next_page)); - } - } else { - log!( - 0, - "Warning: {} {}", - "unable to find pagination links in".bright_yellow(), - url.url.to_string().bright_yellow() - ); - } - } - } - for (id, image) in all_images { - let src = URL::from_href(&image)?; - let dl = ilias.download(&src.url).await?; - let mut path = path.clone(); - if let Some(m) = image_src_regex.captures(&image) { - // image uploaded to ILIAS - let (media_id, filename) = (m.get(1).unwrap().as_str(), m.get(2).unwrap().as_str()); - path.push(file_escape(&format!("{}_{}_{}", id, media_id, filename))); - } else { - // external image - path.push(file_escape(&format!("{}_{}", id, image))); - } - spawn!(handle_gracefully(async move { - let bytes = dl.bytes().await?; - write_file_data(&path, &mut &*bytes) - .await - .context("failed to write forum post image attachment") - })); - } - for (id, name, url) in attachments { - let url = url.context("attachment without href")?; - let src = URL::from_href(&url)?; - let dl = ilias.download(&src.url).await?; - let mut path = path.clone(); - path.push(file_escape(&format!("{}_{}", id, name))); - spawn!(handle_gracefully(async move { - let bytes = dl.bytes().await?; - write_file_data(&path, &mut &*bytes) - .await - .context("failed to write forum post file attachment") - })); - } + ilias::thread::download(&path, relative_path, ilias, url).await?; }, ExerciseHandler { url, .. 
} => { - let html = ilias.get_html(&url.url).await?; - let mut filenames = HashSet::new(); - for row in html.select(&form_group) { - let link = row.select(&LINKS).next(); - if link.is_none() { - continue; - } - let link = link.unwrap(); - let href = link.value().attr("href"); - if href.is_none() { - continue; - } - let href = href.unwrap(); - let url = URL::from_href(href)?; - let cmd = url.cmd.as_deref().unwrap_or(""); - if cmd != "downloadFile" && cmd != "downloadGlobalFeedbackFile" && cmd != "downloadFeedbackFile" { - continue; - } - // link is definitely just a download link to the exercise or the solution - let name = row - .select(&form_name) - .next() - .context("link without file name")? - .text() - .collect::() - .trim() - .to_owned(); - let item = File { url, name }; - let mut path = path.clone(); - // handle files with the same name - let filename = file_escape(item.name()); - let mut parts = filename.rsplitn(2, '.'); - let extension = parts.next().unwrap_or(&filename); - let name = parts.next().unwrap_or(""); - let mut unique_filename = filename.clone(); - let mut i = 1; - while filenames.contains(&unique_filename) { - i += 1; - if name.is_empty() { - unique_filename = format!("{}{}", extension, i); - } else { - unique_filename = format!("{}{}.{}", name, i, extension); - } - } - filenames.insert(unique_filename.clone()); - path.push(unique_filename); - let ilias = Arc::clone(&ilias); - spawn!(process_gracefully(ilias, path, item)); - } + ilias::exercise::download(&path, ilias, url).await?; }, Weblink { url, .. 
} => { - if !ilias.opt.force && fs::metadata(&path).await.is_ok() { - log!(2, "Skipping download, link exists already"); - return Ok(()); - } - let head_req_result = ilias.head(&url.url).await; - let url = match &head_req_result { - Err(e) => e.url().context("HEAD request failed")?.as_str(), - Ok(head) => head.url().as_str(), - }; - if url.starts_with(ILIAS_URL) { - // is a link list - if fs::metadata(&path).await.is_err() { - create_dir(&path).await?; - log!(0, "Writing {}", relative_path.to_string_lossy()); - } - - let urls = { - let html = ilias.get_html(url).await?; - html.select(&LINKS) - .filter_map(|x| x.value().attr("href").map(|y| (y, x.text().collect::()))) - .map(|(x, y)| URL::from_href(x).map(|z| (z, y.trim().to_owned())).context("parsing weblink")) - .collect::>>() - }?; - - for (url, name) in urls { - if url.cmd.as_deref().unwrap_or("") != "callLink" { - continue; - } - - let head = ilias.head(url.url.as_str()).await.context("HEAD request to web link failed"); - if let Some(err) = head.as_ref().err() { - warning!(err); - continue; - } - let head = head.unwrap(); - let url = head.url().as_str(); - write_file_data(path.join(file_escape(&name)), &mut url.as_bytes()).await?; - } - } else { - log!(0, "Writing {}", relative_path.to_string_lossy()); - write_file_data(&path, &mut url.as_bytes()).await.context("failed to save weblink URL")?; - } + ilias::weblink::download(&path, relative_path, ilias, url).await?; }, Wiki { .. } => { log!(1, "Ignored wiki!"); @@ -733,11 +256,17 @@ async fn process(ilias: Arc, path: PathBuf, obj: Object) -> Result<()> { log!(1, "Ignored survey!"); }, Presentation { .. } => { - log!(1, "Ignored interactive presentation! (visit it yourself, it's probably interesting)"); + log!( + 1, + "Ignored interactive presentation! (visit it yourself, it's probably interesting)" + ); }, Generic { .. 
} => { log!(1, "Ignored generic {:?}", obj) }, } + if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) { + PROGRESS_BAR.inc(1); + } Ok(()) } diff --git a/src/queue.rs b/src/queue.rs new file mode 100644 index 0000000..50cd2ad --- /dev/null +++ b/src/queue.rs @@ -0,0 +1,50 @@ +use futures::Future; +use futures_channel::mpsc::{UnboundedReceiver, UnboundedSender}; +use once_cell::sync::{Lazy, OnceCell}; +use tokio::{ + sync::{Semaphore, SemaphorePermit}, + task::{self, JoinHandle}, + time, +}; + +/// Global job queue +static TASKS: OnceCell>> = OnceCell::new(); +static TASKS_RUNNING: Lazy = Lazy::new(|| Semaphore::new(0)); +static REQUEST_TICKETS: Lazy = Lazy::new(|| Semaphore::new(0)); + +pub async fn get_request_ticket() { + REQUEST_TICKETS.acquire().await.unwrap().forget(); +} + +pub async fn get_ticket() -> SemaphorePermit<'static> { + TASKS_RUNNING.acquire().await.unwrap() +} + +pub fn spawn(e: impl Future + Send + 'static) { + TASKS.get().unwrap().unbounded_send(task::spawn(e)).unwrap(); +} + +pub fn set_download_rate(rate: usize) { + task::spawn(async move { + let mut interval = time::interval(time::Duration::from_secs_f64(60.0 / rate as f64)); + loop { + interval.tick().await; + log!( + 2, + "interval ticked @ {}", + std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + ); + REQUEST_TICKETS.add_permits(1); + } + }); +} + +pub fn set_parallel_jobs(jobs: usize) -> UnboundedReceiver> { + let (tx, rx) = futures_channel::mpsc::unbounded::>(); + TASKS.get_or_init(|| tx.clone()); + TASKS_RUNNING.add_permits(jobs); + rx +} diff --git a/src/util.rs b/src/util.rs index 17f4082..4554aab 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,21 +1,38 @@ // SPDX-License-Identifier: GPL-3.0-or-later use anyhow::Context; +use bytes::Bytes; +use futures::TryStreamExt; use tokio::fs::File as AsyncFile; use tokio::io::{AsyncRead, BufWriter}; +use tokio_util::io::StreamReader; +use std::io; use std::path::Path; use crate::Result; +pub 
async fn write_stream_to_file( + path: &Path, + stream: impl futures::Stream> + Unpin, +) -> Result<()> { + let mut reader = StreamReader::new(stream.map_err(|x| io::Error::new(io::ErrorKind::Other, x))); + write_file_data(&path, &mut reader).await?; + Ok(()) +} + /// Write all data to the specified path. Will overwrite previous file data. pub async fn write_file_data(path: impl AsRef, data: &mut R) -> Result<()> where R: AsyncRead + Unpin, { - let file = AsyncFile::create(path.as_ref()).await.context("failed to create file")?; + let file = AsyncFile::create(path.as_ref()) + .await + .context("failed to create file")?; let mut file = BufWriter::new(file); - tokio::io::copy(data, &mut file).await.context("failed to write to file")?; + tokio::io::copy(data, &mut file) + .await + .context("failed to write to file")?; Ok(()) }