Split downloading logic into modules

This commit is contained in:
FliegendeWurst 2021-06-02 11:13:13 +02:00
parent 0ebe5bc3cf
commit e7354e0ad1
17 changed files with 806 additions and 553 deletions

6
Cargo.lock generated
View File

@ -8,6 +8,7 @@ version = "0.2.21"
dependencies = [
"anyhow",
"atty",
"bytes",
"cfg-if",
"colored",
"cookie_store 0.14.1",
@ -355,11 +356,10 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.4"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
dependencies = [
"autocfg",
"cfg-if",
"lazy_static",
]

View File

@ -32,6 +32,7 @@ atty = "0.2.14"
h2 = "0.3.3"
cookie_store = "0.14.0"
reqwest_cookie_store = "0.1.5"
bytes = "1.0.1"
[features]
default = []

View File

@ -1,3 +1,3 @@
hard_tabs = true
match_block_trailing_comma = true
max_width = 145
max_width = 120

View File

@ -6,8 +6,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize};
#[cfg(feature = "keyring-auth")]
use anyhow::anyhow;
use anyhow::{Context, Result};
#[cfg(feature = "keyring-auth")]
use colored::Colorize as _;
use indicatif::ProgressBar;
use once_cell::sync::Lazy;
use structopt::StructOpt;
@ -87,6 +85,8 @@ pub static PROGRESS_BAR: Lazy<ProgressBar> = Lazy::new(|| ProgressBar::new(0));
macro_rules! log {
($lvl:expr, $($t:expr),+) => {{
#[allow(unused_imports)]
use colored::Colorize as _;
#[allow(unused_comparisons)] // 0 <= 0
if $lvl <= crate::cli::LOG_LEVEL.load(std::sync::atomic::Ordering::SeqCst) {
if crate::cli::PROGRESS_BAR_ENABLED.load(std::sync::atomic::Ordering::SeqCst) {
@ -111,21 +111,21 @@ macro_rules! success {
}
macro_rules! warning {
($e:expr) => {
($e:expr) => {{
log!(0, "Warning: {}", format!("{:?}", $e).bright_yellow());
};
($msg:expr, $e:expr) => {
}};
($msg:expr, $e:expr) => {{
log!(0, "Warning: {}", format!("{} {:?}", $msg, $e).bright_yellow());
};
($msg1:expr, $msg2:expr, $e:expr) => {
}};
($msg1:expr, $msg2:expr, $e:expr) => {{
log!(0, "Warning: {}", format!("{} {} {:?}", $msg1, $msg2, $e).bright_yellow());
};
(format => $($e:expr),+) => {
}};
(format => $($e:expr),+) => {{
log!(0, "Warning: {}", format!($($e),+).bright_yellow());
};
($lvl:expr; $($e:expr),+) => {
}};
($lvl:expr; $($e:expr),+) => {{
log!($lvl, "Warning: {}", format!($($e),+).bright_yellow());
}
}};
}
macro_rules! error {

View File

@ -3,15 +3,32 @@
use std::{error::Error as _, io::Write, sync::Arc};
use anyhow::{anyhow, Context, Result};
use colored::Colorize;
use cookie_store::CookieStore;
use ignore::gitignore::Gitignore;
use once_cell::sync::Lazy;
use reqwest::{Client, IntoUrl, Proxy, Url};
use reqwest_cookie_store::CookieStoreMutex;
use scraper::{ElementRef, Html, Selector};
use serde_json::json;
use crate::{cli::Opt, get_request_ticket, selectors::*, ILIAS_URL};
use crate::{cli::Opt, queue, ILIAS_URL};
pub mod course;
pub mod exercise;
pub mod file;
pub mod folder;
pub mod forum;
pub mod plugin_dispatch;
pub mod thread;
pub mod video;
pub mod weblink;
// CSS selectors, each parsed lazily on first use and then reused.
// Matches every anchor element.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
// Matches `div.alert-danger`; get_html / get_html_fragment treat any match as an "ILIAS error".
static ALERT_DANGER: Lazy<Selector> = Lazy::new(|| Selector::parse("div.alert-danger").unwrap());
// Matches the element with id `il_center_col`; get_course_content reads the main page text from it.
static IL_CONTENT_CONTAINER: Lazy<Selector> = Lazy::new(|| Selector::parse("#il_center_col").unwrap());
// Matches `span.il_ItemProperty`; used to read file metadata (extension, version) of a list item.
static ITEM_PROP: Lazy<Selector> = Lazy::new(|| Selector::parse("span.il_ItemProperty").unwrap());
// Matches `div.il_ContainerListItem`, i.e. one entry of a container listing.
static CONTAINER_ITEMS: Lazy<Selector> = Lazy::new(|| Selector::parse("div.il_ContainerListItem").unwrap());
// Matches `a.il_ContainerItemTitle`, the title link inside a container list entry.
static CONTAINER_ITEM_TITLE: Lazy<Selector> = Lazy::new(|| Selector::parse("a.il_ContainerItemTitle").unwrap());
pub struct ILIAS {
pub opt: Opt,
@ -38,8 +55,7 @@ fn error_is_http2(error: &reqwest::Error) -> bool {
impl ILIAS {
// TODO: de-duplicate the logic below
pub async fn with_session(opt: Opt, session: Arc<CookieStoreMutex>, ignore: Gitignore) -> Result<Self> {
let mut builder =
Client::builder()
let mut builder = Client::builder()
.cookie_provider(Arc::clone(&session))
.user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")));
if let Some(proxy) = opt.proxy.as_ref() {
@ -62,11 +78,9 @@ impl ILIAS {
let cookie_store = CookieStore::default();
let cookie_store = reqwest_cookie_store::CookieStoreMutex::new(cookie_store);
let cookie_store = std::sync::Arc::new(cookie_store);
let mut builder = Client::builder().cookie_provider(Arc::clone(&cookie_store)).user_agent(concat!(
env!("CARGO_PKG_NAME"),
"/",
env!("CARGO_PKG_VERSION")
));
let mut builder = Client::builder()
.cookie_provider(Arc::clone(&cookie_store))
.user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")));
if let Some(proxy) = opt.proxy.as_ref() {
let proxy = Proxy::all(proxy)?;
builder = builder.proxy(proxy);
@ -118,7 +132,10 @@ impl ILIAS {
.await?;
let dom = Html::parse_document(&login_response);
let saml = Selector::parse(r#"input[name="SAMLResponse"]"#).unwrap();
let saml = dom.select(&saml).next().context("no SAML response, incorrect password?")?;
let saml = dom
.select(&saml)
.next()
.context("no SAML response, incorrect password?")?;
let relay_state = Selector::parse(r#"input[name="RelayState"]"#).unwrap();
let relay_state = dom.select(&relay_state).next().context("no relay state")?;
info!("Logging into ILIAS..");
@ -136,7 +153,9 @@ impl ILIAS {
pub async fn save_session(&self) -> Result<()> {
let session_path = self.opt.output.join(".iliassession");
let mut writer = std::fs::File::create(session_path).map(std::io::BufWriter::new).unwrap();
let mut writer = std::fs::File::create(session_path)
.map(std::io::BufWriter::new)
.unwrap();
let store = self.cookies.lock().map_err(|x| anyhow!("{}", x))?;
// save all cookies, including session cookies
for cookie in store.iter_unexpired().map(serde_json::to_string) {
@ -147,7 +166,7 @@ impl ILIAS {
}
pub async fn download(&self, url: &str) -> Result<reqwest::Response> {
get_request_ticket().await;
queue::get_request_ticket().await;
log!(2, "Downloading {}", url);
let url = if url.starts_with("http://") || url.starts_with("https://") {
url.to_owned()
@ -171,7 +190,7 @@ impl ILIAS {
}
pub async fn head<U: IntoUrl>(&self, url: U) -> Result<reqwest::Response, reqwest::Error> {
get_request_ticket().await;
queue::get_request_ticket().await;
let url = url.into_url()?;
for attempt in 1..10 {
let result = self.client.head(url.clone()).send().await;
@ -199,7 +218,7 @@ impl ILIAS {
}
let text = self.download(url).await?.text().await?;
let html = Html::parse_document(&text);
if html.select(&alert_danger).next().is_some() {
if html.select(&ALERT_DANGER).next().is_some() {
Err(anyhow!("ILIAS error"))
} else {
Ok(html)
@ -209,7 +228,7 @@ impl ILIAS {
pub async fn get_html_fragment(&self, url: &str) -> Result<Html> {
let text = self.download(url).await?.text().await?;
let html = Html::parse_fragment(&text);
if html.select(&alert_danger).next().is_some() {
if html.select(&ALERT_DANGER).next().is_some() {
Err(anyhow!("ILIAS error"))
} else {
Ok(html)
@ -217,9 +236,11 @@ impl ILIAS {
}
pub fn get_items(html: &Html) -> Vec<Result<Object>> {
html.select(&container_items)
html.select(&CONTAINER_ITEMS)
.flat_map(|item| {
item.select(&container_item_title).next().map(|link| Object::from_link(item, link))
item.select(&CONTAINER_ITEM_TITLE)
.next()
.map(|link| Object::from_link(item, link))
// items without links are ignored
})
.collect()
@ -229,7 +250,7 @@ impl ILIAS {
pub async fn get_course_content(&self, url: &URL) -> Result<(Vec<Result<Object>>, Option<String>)> {
let html = self.get_html(&url.url).await?;
let main_text = if let Some(el) = html.select(&il_content_container).next() {
let main_text = if let Some(el) = html.select(&IL_CONTENT_CONTAINER).next() {
if !el
.children()
.flat_map(|x| x.value().as_element())
@ -268,6 +289,10 @@ impl ILIAS {
}
}
/// Common interface for ILIAS object kinds.
///
/// NOTE(review): no implementation of this trait is visible in this chunk —
/// confirm whether it is used yet or is a placeholder for a later refactoring.
trait IliasObject {
/// Presumably downloads the object through the shared `ilias` handle; returns an error on failure.
fn download(ilias: Arc<ILIAS>) -> Result<()>;
}
#[derive(Debug)]
pub enum Object {
Course { name: String, url: URL },
@ -405,9 +430,15 @@ impl Object {
// download page containing metadata
return Ok(Generic { name, url });
} else {
let mut item_props = item.context("can't construct file object without HTML object")?.select(&item_prop);
let mut item_props = item
.context("can't construct file object without HTML object")?
.select(&ITEM_PROP);
let ext = item_props.next().context("cannot find file extension")?;
let version = item_props.nth(1).context("cannot find 3rd file metadata")?.text().collect::<String>();
let version = item_props
.nth(1)
.context("cannot find 3rd file metadata")?
.text()
.collect::<String>();
let version = version.trim();
if let Some(v) = version.strip_prefix("Version: ") {
name += "_v";

50
src/ilias/course.rs Normal file
View File

@ -0,0 +1,50 @@
use std::{path::PathBuf, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use crate::{
process_gracefully,
queue::spawn,
util::{file_escape, write_file_data},
};
use super::{ILIAS, URL};
// Matches `cmdNode=uf:` followed by two word characters; compiled lazily on first use.
static CMD_NODE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"cmdNode=uf:\w\w"#).unwrap());
pub async fn download(path: PathBuf, ilias: Arc<ILIAS>, url: &URL, name: &str) -> Result<()> {
let content = if ilias.opt.content_tree {
let html = ilias.download(&url.url).await?.text().await?;
let cmd_node = CMD_NODE_REGEX.find(&html).context("can't find cmdNode")?.as_str()[8..].to_owned();
let content_tree = ilias.get_course_content_tree(&url.ref_id, &cmd_node).await;
match content_tree {
Ok(tree) => (tree.into_iter().map(Result::Ok).collect(), None),
Err(e) => {
// some folders are hidden on the course page and can only be found via the RSS feed / recent activity / content tree sidebar
// TODO: this is probably never the case for folders?
if html.contains(r#"input[name="cmd[join]""#) {
return Ok(()); // ignore groups we are not in
}
warning!(name, "falling back to incomplete course content extractor!", e);
ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x
},
}
} else {
ilias.get_course_content(&url).await?
};
if let Some(s) = content.1.as_ref() {
let path = path.join("course.html");
write_file_data(&path, &mut s.as_bytes())
.await
.context("failed to write course page html")?;
}
for item in content.0 {
let item = item?;
let path = path.join(file_escape(item.name()));
let ilias = Arc::clone(&ilias);
spawn(process_gracefully(ilias, path, item));
}
Ok(())
}

66
src/ilias/exercise.rs Normal file
View File

@ -0,0 +1,66 @@
use std::{collections::HashSet, path::Path, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use scraper::Selector;
use crate::{process_gracefully, queue::spawn, util::file_escape};
use super::{Object, ILIAS, URL};
// CSS selectors, parsed lazily on first use.
// Matches every anchor element.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
// Matches `.form-group`; the download function treats each match as one candidate row.
static FORM_GROUP: Lazy<Selector> = Lazy::new(|| Selector::parse(".form-group").unwrap());
// Matches `.il_InfoScreenProperty`; its text is used as the file name of a row.
static FORM_NAME: Lazy<Selector> = Lazy::new(|| Selector::parse(".il_InfoScreenProperty").unwrap());
/// Queues downloads for all files linked on an exercise page.
///
/// * `path` - output directory of the exercise
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the exercise page
///
/// Only links whose command is `downloadFile`, `downloadGlobalFeedbackFile` or
/// `downloadFeedbackFile` are considered. Files that would share a name get a
/// numeric suffix (starting at 2) placed before the extension.
/// Returns an error if the page cannot be loaded, a link cannot be parsed,
/// or a download link has no file name next to it.
pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
    let html = ilias.get_html(&url.url).await?;
    let mut used_names = HashSet::new();
    for row in html.select(&FORM_GROUP) {
        // rows without a link, or with a link lacking an href, are of no interest
        let href = match row.select(&LINKS).next().and_then(|anchor| anchor.value().attr("href")) {
            Some(href) => href,
            None => continue,
        };
        let url = URL::from_href(href)?;
        let is_download = matches!(
            url.cmd.as_deref(),
            Some("downloadFile") | Some("downloadGlobalFeedbackFile") | Some("downloadFeedbackFile")
        );
        if !is_download {
            continue;
        }
        // link is definitely just a download link to the exercise or the solution
        let label = row
            .select(&FORM_NAME)
            .next()
            .context("link without file name")?
            .text()
            .collect::<String>();
        let item = Object::File {
            url,
            name: label.trim().to_owned(),
        };

        // handle files with the same name
        let filename = file_escape(item.name());
        let mut pieces = filename.rsplitn(2, '.');
        let extension = pieces.next().unwrap_or(&filename);
        let stem = pieces.next().unwrap_or("");
        let mut candidate = filename.clone();
        let mut counter = 1;
        while used_names.contains(&candidate) {
            counter += 1;
            candidate = if stem.is_empty() {
                format!("{}{}", extension, counter)
            } else {
                format!("{}{}.{}", stem, counter, extension)
            };
        }
        used_names.insert(candidate.clone());
        let target = path.join(candidate);
        spawn(process_gracefully(Arc::clone(&ilias), target, item));
    }
    Ok(())
}

22
src/ilias/file.rs Normal file
View File

@ -0,0 +1,22 @@
use std::{path::Path, sync::Arc};
use anyhow::Result;
use tokio::fs;
use crate::util::write_stream_to_file;
use super::{ILIAS, URL};
/// Downloads a single file to `path`.
///
/// * `path` - destination of the file
/// * `relative_path` - path shown in the log output
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the file
///
/// Does nothing when the `skip_files` option is set, or when the destination
/// already exists and the `force` option is not set.
/// Returns an error if the request or writing the file fails.
pub async fn download(path: &Path, relative_path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
    let options = &ilias.opt;
    if options.skip_files {
        return Ok(());
    }
    let already_present = !options.force && fs::metadata(&path).await.is_ok();
    if already_present {
        log!(2, "Skipping download, file exists already");
        return Ok(());
    }
    let response = ilias.download(&url.url).await?;
    log!(0, "Writing {}", relative_path.to_string_lossy());
    write_stream_to_file(&path, response.bytes_stream()).await?;
    Ok(())
}

28
src/ilias/folder.rs Normal file
View File

@ -0,0 +1,28 @@
use std::{path::Path, sync::Arc};
use anyhow::{Context, Result};
use crate::{
process_gracefully,
queue::spawn,
util::{file_escape, write_file_data},
};
use super::{ILIAS, URL};
pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
let content = ilias.get_course_content(&url).await?;
if let Some(s) = content.1.as_ref() {
let path = path.join("folder.html");
write_file_data(&path, &mut s.as_bytes())
.await
.context("failed to write folder page html")?;
}
for item in content.0 {
let item = item?;
let path = path.join(file_escape(item.name()));
let ilias = Arc::clone(&ilias);
spawn(process_gracefully(ilias, path, item));
}
Ok(())
}

100
src/ilias/forum.rs Normal file
View File

@ -0,0 +1,100 @@
use std::{path::Path, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use scraper::{Html, Selector};
use crate::{ilias::Object, process_gracefully, queue::spawn, util::file_escape};
use super::{ILIAS, URL};
// CSS selectors, parsed lazily on first use.
// Matches every anchor element.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
// Matches table header cells; rows containing one are skipped.
static TABLE_HEADER: Lazy<Selector> = Lazy::new(|| Selector::parse("th").unwrap());
// Matches table rows; each remaining row is treated as one thread.
static TABLE_ROW: Lazy<Selector> = Lazy::new(|| Selector::parse("tr").unwrap());
// Matches table data cells.
static TABLE_CELLS: Lazy<Selector> = Lazy::new(|| Selector::parse("td").unwrap());
// Matches links inside the table navigation; any match means there are further pages.
static FORUM_PAGES: Lazy<Selector> =
Lazy::new(|| Selector::parse("div.ilTableNav > table > tbody > tr > td > a").unwrap());
// Cell text of an empty table (German for "No entries").
const NO_ENTRIES: &str = "Keine Einträge";
/// Queues a download for every thread listed on the forum page at `url`.
///
/// * `path` - output directory of the forum
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the forum
///
/// Does nothing unless the `forum` option is set. The forum page is loaded once to
/// find the link containing `trows=800`, and the page behind that link is then used
/// as the thread list. Threads whose directory already contains at least as many
/// entries as the listed post count are skipped unless `force` is set.
///
/// Returns an error if a page cannot be loaded, the `trows=800` link is missing on a
/// non-empty forum, or a thread row lacks its link, `thr_pk` or a numeric post count.
pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
    if !ilias.opt.forum {
        return Ok(());
    }
    let url = &url.url;
    let html = {
        let data = ilias.download(url);
        let html_text = data.await?.text().await?;
        // The first parsed document lives only inside this inner block, so it is
        // gone before the next request is awaited.
        let url = {
            let html = Html::parse_document(&html_text);
            let thread_count_selector = html
                .select(&LINKS)
                .flat_map(|x| x.value().attr("href"))
                .find(|x| x.contains("trows=800"));
            if thread_count_selector.is_none() {
                if let Some(cell) = html.select(&TABLE_CELLS).next() {
                    if cell.text().any(|x| x == NO_ENTRIES) {
                        return Ok(()); // empty forum
                    }
                }
            }
            thread_count_selector
                .context("can't find forum thread count selector (empty forum?)")?
                .to_owned()
        };
        let data = ilias.download(&url);
        let html = data.await?.text().await?;
        Html::parse_document(&html)
    };
    for row in html.select(&TABLE_ROW) {
        if row.value().attr("class") == Some("hidden-print") {
            continue; // thread count
        }
        if row.select(&TABLE_HEADER).next().is_some() {
            continue;
        }
        let cells = row.select(&TABLE_CELLS).collect::<Vec<_>>();
        if cells.len() != 6 {
            // `warning!` already prepends "Warning: ", so the message must not repeat it
            warning!(format => "unusual table row ({} cells) in {}", cells.len(), url);
            continue;
        }
        let link = cells[1].select(&LINKS).next().context("thread link not found")?;
        let object = Object::from_link(link, link)?;
        let name = format!(
            "{}_{}",
            object.url().thr_pk.as_ref().context("thr_pk not found for thread")?,
            link.text().collect::<String>().trim()
        );
        let path = path.join(file_escape(&name));
        // FIXME: this heuristic no longer works after downloading attachments
        // TODO: set modification date?
        // TODO: make this async
        // A missing or unreadable directory counts as zero saved posts.
        let saved_posts = std::fs::read_dir(&path).map(|entries| entries.count()).unwrap_or(0);
        let available_posts = cells[3]
            .text()
            .next()
            .unwrap_or_default()
            .trim()
            .parse::<usize>()
            .context("parsing post count failed")?;
        if available_posts <= saved_posts && !ilias.opt.force {
            continue;
        }
        let ilias = Arc::clone(&ilias);
        spawn(process_gracefully(ilias, path, object));
    }
    // Only the first page of the thread list is processed.
    if html.select(&FORUM_PAGES).next().is_some() {
        log!(0, "Ignoring older threads in {:?}..", path);
    }
    Ok(())
}

View File

@ -0,0 +1,87 @@
use std::{path::Path, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use reqwest::Url;
use scraper::{Html, Selector};
use crate::{ilias::Object, process_gracefully, queue::spawn, util::file_escape, ILIAS_URL};
use super::{ILIAS, URL};
// CSS selectors, parsed lazily on first use.
// Matches every anchor element.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
// Matches anchors with `target="_blank"`; used to find the video link of a table row.
static A_TARGET_BLANK: Lazy<Selector> = Lazy::new(|| Selector::parse(r#"a[target="_blank"]"#).unwrap());
// Matches the body rows of the table inside `.ilTableOuter`; one row per video.
static VIDEO_ROWS: Lazy<Selector> = Lazy::new(|| Selector::parse(".ilTableOuter > div > table > tbody > tr").unwrap());
// Matches table data cells.
static TABLE_CELLS: Lazy<Selector> = Lazy::new(|| Selector::parse("td").unwrap());
// Row text of an empty table (German for "No entries").
const NO_ENTRIES: &str = "Keine Einträge";
/// Queues a download for every video listed by the plugin object at `url`.
///
/// * `path` - output directory for the videos
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the plugin object; its `ref_id` selects the video list
///
/// Does nothing when the `no_videos` option is set. The short list (limit 20) is
/// loaded first to find the link containing `trows=800`; that link is rewritten
/// into an asynchronous table request and every row of the result is inspected.
/// Returns an error if a request fails, the list link is missing, the rewritten URL
/// is invalid, or a video link has no `href`.
pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
    if ilias.opt.no_videos {
        return Ok(());
    }
    let full_list_href = {
        // first find the link to full video list
        let list_url = format!("{}ilias.php?ref_id={}&cmdClass=xocteventgui&cmdNode=nc:n4:14u&baseClass=ilObjPluginDispatchGUI&lang=de&limit=20&cmd=asyncGetTableGUI&cmdMode=asynch", ILIAS_URL, url.ref_id);
        log!(1, "Loading {}", list_url);
        let response = ilias.download(&list_url).await?;
        let body = response.text().await?;
        let fragment = Html::parse_fragment(&body);
        let first_match = fragment
            .select(&LINKS)
            .filter_map(|anchor| anchor.value().attr("href"))
            .find(|href| href.contains("trows=800"))
            .map(|href| href.to_string());
        first_match.context("video list link not found")?
    };
    log!(1, "Rewriting {}", full_list_href);
    let mut full_url = Url::parse(&format!("{}{}", ILIAS_URL, full_list_href))?;
    // copy the query, overriding `cmd` / `cmdClass` and appending `cmdMode`
    let mut query_parameters: Vec<(String, String)> = full_url
        .query_pairs()
        .map(|(key, value)| {
            let value = match &*key {
                "cmd" => "asyncGetTableGUI".to_owned(),
                "cmdClass" => "xocteventgui".to_owned(),
                _ => value.into_owned(),
            };
            (key.into_owned(), value)
        })
        .collect();
    query_parameters.push(("cmdMode".into(), "asynch".into()));
    full_url
        .query_pairs_mut()
        .clear()
        .extend_pairs(&query_parameters)
        .finish();
    log!(1, "Loading {}", full_url);
    let response = ilias.download(full_url.as_str()).await?;
    let body = response.text().await?;
    let table = Html::parse_fragment(&body);
    for row in table.select(&VIDEO_ROWS) {
        let link = match row.select(&A_TARGET_BLANK).next() {
            Some(link) => link,
            None => {
                if !row.text().any(|x| x == NO_ENTRIES) {
                    warning!(format => "table row without link in {}", url.url);
                }
                continue;
            },
        };
        // the title is taken from the third cell of the row
        let title_cell = match row.select(&TABLE_CELLS).nth(2) {
            Some(cell) => cell,
            None => continue,
        };
        let title = title_cell.text().collect::<String>();
        let title = title.trim();
        if title.starts_with("<div") {
            continue;
        }
        let target = path.join(format!("{}.mp4", file_escape(title)));
        log!(1, "Found video: {}", title);
        let video = Object::Video {
            url: URL::raw(link.value().attr("href").context("video link without href")?.to_owned()),
        };
        spawn(process_gracefully(Arc::clone(&ilias), target, video));
    }
    Ok(())
}

145
src/ilias/thread.rs Normal file
View File

@ -0,0 +1,145 @@
use std::{path::Path, sync::Arc};
use anyhow::{anyhow, Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use scraper::Selector;
use crate::{
handle_gracefully, process_gracefully,
queue::spawn,
util::{file_escape, write_file_data},
};
use super::{Object, ILIAS, URL};
// CSS selectors and regexes, built lazily on first use.
// Matches every anchor element.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
// Matches every image element.
static IMAGES: Lazy<Selector> = Lazy::new(|| Selector::parse("img").unwrap());
// Matches every table; the first one on the page is searched for pagination links.
static TABLES: Lazy<Selector> = Lazy::new(|| Selector::parse("table").unwrap());
// Matches links inside table body cells.
static LINK_IN_TABLE: Lazy<Selector> = Lazy::new(|| Selector::parse("tbody tr td a").unwrap());
// Matches `.ilFrmPostRow`; one match per forum post.
static POST_ROW: Lazy<Selector> = Lazy::new(|| Selector::parse(".ilFrmPostRow").unwrap());
// Matches `.ilFrmPostTitle`, the title of a post.
static POST_TITLE: Lazy<Selector> = Lazy::new(|| Selector::parse(".ilFrmPostTitle").unwrap());
// Matches `.ilFrmPostContentContainer`, whose inner HTML is saved as the post.
static POST_CONTAINER: Lazy<Selector> = Lazy::new(|| Selector::parse(".ilFrmPostContentContainer").unwrap());
// Matches `.ilFrmPostAttachmentsContainer`, the attachment list of a post.
static POST_ATTACHMENTS: Lazy<Selector> = Lazy::new(|| Selector::parse(".ilFrmPostAttachmentsContainer").unwrap());
// Matches `span.small`; its text holds the '|'-separated author information.
static SPAN_SMALL: Lazy<Selector> = Lazy::new(|| Selector::parse("span.small").unwrap());
// Matches image paths of the form `./data/produktiv/mobs/mm_<digits>/<name>?...`;
// group 1 is the numeric id, group 2 the file name up to the first '?'.
static IMAGE_SRC_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\./data/produktiv/mobs/mm_(\d+)/([^?]+).+"#).unwrap());
/// Downloads one page of a forum thread: every post, its images and its attachments.
///
/// * `path` - output directory of the thread
/// * `relative_path` - directory used for the paths shown in the log output
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the thread page
///
/// Does nothing unless the `forum` option is set. If the page has a ">>" pagination
/// link, the next page is queued as another thread object writing to the same `path`.
///
/// Returns an error if the page cannot be loaded, a post lacks its title, author,
/// content container, link or link id, the author data has an unexpected number of
/// parts, an image has no `src`, an attachment has no `href`, or an image/attachment
/// request fails.
pub async fn download(path: &Path, relative_path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
if !ilias.opt.forum {
return Ok(());
}
// (post id, image src) pairs collected from all posts on this page
let mut all_images = Vec::new();
// (post id, link text, optional href) triples collected from all posts on this page
let mut attachments = Vec::new();
// NOTE(review): `html` is confined to this block, presumably so that the parsed
// document is dropped before the download awaits further below — confirm.
{
let html = ilias.get_html(&url.url).await?;
for post in html.select(&POST_ROW) {
let title = post
.select(&POST_TITLE)
.next()
.context("post title not found")?
.text()
.collect::<String>();
let author = post.select(&SPAN_SMALL).next().context("post author not found")?;
let author = author.text().collect::<String>();
// the author text is split on '|'; only 2 or 3 parts are accepted
let author = author.trim().split('|').collect::<Vec<_>>();
let author = if author.len() == 2 {
author[0] // pseudonymous forum
} else if author.len() == 3 {
// use the second part unless it is the literal "Pseudonym"
if author[1] != "Pseudonym" {
author[1]
} else {
author[0]
}
} else {
return Err(anyhow!("author data in unknown format"));
}
.trim();
let container = post
.select(&POST_CONTAINER)
.next()
.context("post container not found")?;
// the `id` attribute of the first link in the post identifies the post
let link = container.select(&LINKS).next().context("post link not found")?;
let id = link.value().attr("id").context("no id in thread link")?.to_owned();
let name = format!("{}_{}_{}.html", id, author, title.trim());
let data = container.inner_html();
let path = path.join(file_escape(&name));
let relative_path = relative_path.join(file_escape(&name));
// write the post HTML in a separately spawned task
spawn(handle_gracefully(async move {
log!(0, "Writing {}", relative_path.display());
write_file_data(&path, &mut data.as_bytes())
.await
.context("failed to write forum post")
}));
// remember the images of this post; they are fetched after the page was processed
let images = container
.select(&IMAGES)
.map(|x| x.value().attr("src").map(|x| x.to_owned()));
for image in images {
let image = image.context("no src on image")?;
all_images.push((id.clone(), image));
}
// remember the attachments; a missing href is only reported when downloading
if let Some(container) = container.select(&POST_ATTACHMENTS).next() {
for attachment in container.select(&LINKS) {
attachments.push((
id.clone(),
attachment.text().collect::<String>(),
attachment.value().attr("href").map(|x| x.to_owned()),
));
}
}
}
// pagination
if let Some(pages) = html.select(&TABLES).next() {
// only the last link of the first table is inspected
if let Some(last) = pages.select(&LINK_IN_TABLE).last() {
let text = last.text().collect::<String>();
if text.trim() == ">>" {
// not last page yet
let ilias = Arc::clone(&ilias);
let next_page = Object::Thread {
url: URL::from_href(last.value().attr("href").context("page link not found")?)?,
};
spawn(process_gracefully(ilias, path.to_owned(), next_page));
}
} else {
log!(
0,
"Warning: {} {}",
"unable to find pagination links in".bright_yellow(),
url.url.to_string().bright_yellow()
);
}
}
}
// request each image here, then read and write its body in a spawned task
for (id, image) in all_images {
let src = URL::from_href(&image)?;
let dl = ilias.download(&src.url).await?;
let mut path = path.to_owned();
if let Some(m) = IMAGE_SRC_REGEX.captures(&image) {
// image uploaded to ILIAS
let (media_id, filename) = (m.get(1).unwrap().as_str(), m.get(2).unwrap().as_str());
path.push(file_escape(&format!("{}_{}_{}", id, media_id, filename)));
} else {
// external image
path.push(file_escape(&format!("{}_{}", id, image)));
}
spawn(handle_gracefully(async move {
let bytes = dl.bytes().await?;
write_file_data(&path, &mut &*bytes)
.await
.context("failed to write forum post image attachment")
}));
}
// same procedure for file attachments, stored as `<post id>_<link text>`
for (id, name, url) in attachments {
let url = url.context("attachment without href")?;
let src = URL::from_href(&url)?;
let dl = ilias.download(&src.url).await?;
let mut path = path.to_owned();
path.push(file_escape(&format!("{}_{}", id, name)));
spawn(handle_gracefully(async move {
let bytes = dl.bytes().await?;
write_file_data(&path, &mut &*bytes)
.await
.context("failed to write forum post file attachment")
}));
}
Ok(())
}

57
src/ilias/video.rs Normal file
View File

@ -0,0 +1,57 @@
use std::{path::Path, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use tokio::fs;
use crate::{util::write_stream_to_file, ILIAS_URL};
use super::{ILIAS, URL};
// Matches a `<script>` element consisting of a `xoctPaellaPlayer.init(...)` call;
// group 1 captures everything between the parentheses. Compiled lazily on first use.
static XOCT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"(?m)<script>\s+xoctPaellaPlayer\.init\(([\s\S]+)\)\s+</script>"#).unwrap());
/// Downloads a video, or checks an already downloaded one for changes.
///
/// * `path` - destination of the video file
/// * `relative_path` - path shown in the log output
/// * `ilias` - shared ILIAS session
/// * `url` - URL of the video page, relative to the ILIAS base URL
///
/// Does nothing when `no_videos` is set, or when the file exists and neither `force`
/// nor `check_videos` is set. With `check_videos` (and without `force`) an existing
/// file is only compared against the `content-length` of a HEAD request and a
/// warning is logged on mismatch; otherwise the video is downloaded.
/// Returns an error if a request fails, the player JSON or video source cannot be
/// found or parsed, or writing the file fails.
pub async fn download(path: &Path, relative_path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
    let options = &ilias.opt;
    if options.no_videos {
        return Ok(());
    }
    let exists = fs::metadata(&path).await.is_ok();
    if exists && !options.force && !options.check_videos {
        log!(2, "Skipping download, file exists already");
        return Ok(());
    }
    let page_url = format!("{}{}", ILIAS_URL, url.url);
    let page = ilias.download(&page_url).await?.text().await?;
    log!(2, "{}", page);
    let json: serde_json::Value = {
        let captures = XOCT_REGEX.captures(&page).context("xoct player json not found")?;
        let init_args = &captures[1];
        log!(2, "{}", init_args);
        // only the part before the first ",\n" is parsed as JSON
        let first_arg = init_args.split(",\n").next().context("invalid xoct player json")?;
        serde_json::from_str(first_arg.trim())?
    };
    log!(2, "{}", json);
    let video_src = json
        .pointer("/streams/0/sources/mp4/0/src")
        .context("video src not found")?
        .as_str()
        .context("video src not string")?;
    let meta = fs::metadata(&path).await;
    let check_only = !options.force && meta.is_ok() && options.check_videos;
    if !check_only {
        let resp = ilias.download(video_src).await?;
        log!(0, "Writing {}", relative_path.to_string_lossy());
        write_stream_to_file(&path, resp.bytes_stream()).await?;
        return Ok(());
    }
    let head = ilias.head(video_src).await.context("HEAD request failed")?;
    if let Some(len) = head.headers().get("content-length") {
        let local_size = meta?.len();
        let remote_size = len.to_str()?.parse::<u64>()?;
        if local_size != remote_size {
            warning!(
                relative_path.to_string_lossy(),
                "was updated, consider moving the outdated file"
            );
        }
    }
    Ok(())
}

70
src/ilias/weblink.rs Normal file
View File

@ -0,0 +1,70 @@
use std::{path::Path, sync::Arc};
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use scraper::Selector;
use tokio::fs;
use crate::{
util::{create_dir, file_escape, write_file_data},
ILIAS_URL,
};
use super::{ILIAS, URL};
// CSS selector matching every anchor element; parsed lazily on first use.
static LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
pub async fn download(path: &Path, relative_path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
if !ilias.opt.force && fs::metadata(&path).await.is_ok() {
log!(2, "Skipping download, link exists already");
return Ok(());
}
let head_req_result = ilias.head(&url.url).await;
let url = match &head_req_result {
Err(e) => e.url().context("HEAD request failed")?.as_str(),
Ok(head) => head.url().as_str(),
};
if url.starts_with(ILIAS_URL) {
// is a link list
if fs::metadata(&path).await.is_err() {
create_dir(&path).await?;
log!(0, "Writing {}", relative_path.to_string_lossy());
}
let urls = {
let html = ilias.get_html(url).await?;
html.select(&LINKS)
.filter_map(|x| x.value().attr("href").map(|y| (y, x.text().collect::<String>())))
.map(|(x, y)| {
URL::from_href(x)
.map(|z| (z, y.trim().to_owned()))
.context("parsing weblink")
})
.collect::<Result<Vec<_>>>()
}?;
for (url, name) in urls {
if url.cmd.as_deref().unwrap_or("") != "callLink" {
continue;
}
let head = ilias
.head(url.url.as_str())
.await
.context("HEAD request to web link failed");
if let Some(err) = head.as_ref().err() {
warning!(err);
continue;
}
let head = head.unwrap();
let url = head.url().as_str();
write_file_data(path.join(file_escape(&name)), &mut url.as_bytes()).await?;
}
} else {
log!(0, "Writing {}", relative_path.to_string_lossy());
write_file_data(&path, &mut url.as_bytes())
.await
.context("failed to save weblink URL")?;
}
Ok(())
}

View File

@ -1,26 +1,14 @@
// SPDX-License-Identifier: GPL-3.0-or-later
#![allow(clippy::upper_case_acronyms)]
use anyhow::{anyhow, Context, Result};
use colored::Colorize;
use futures::future::{self, Either};
use futures_channel::mpsc::UnboundedSender;
use futures_util::stream::TryStreamExt;
use futures_util::StreamExt;
use futures::StreamExt;
use ignore::gitignore::Gitignore;
use indicatif::{ProgressDrawTarget, ProgressStyle};
use once_cell::sync::{Lazy, OnceCell};
use scraper::Html;
use structopt::StructOpt;
use tokio::task::{self, JoinHandle};
use tokio::{fs, sync::Semaphore, time};
use tokio_util::io::StreamReader;
use url::Url;
use tokio::fs;
use std::collections::HashSet;
use std::future::Future;
use std::io;
use std::io::BufReader;
use std::path::PathBuf;
use std::sync::atomic::Ordering;
@ -29,7 +17,8 @@ use std::time::SystemTime;
static ILIAS_URL: &str = "https://ilias.studium.kit.edu/";
/// main personal desktop
static DEFAULT_SYNC_URL: &str = "https://ilias.studium.kit.edu/ilias.php?baseClass=ilPersonalDesktopGUI&cmd=jumpToSelectedItems";
static DEFAULT_SYNC_URL: &str =
"https://ilias.studium.kit.edu/ilias.php?baseClass=ilPersonalDesktopGUI&cmd=jumpToSelectedItems";
#[macro_use]
mod cli;
@ -37,35 +26,13 @@ use cli::*;
mod ilias;
use ilias::*;
use Object::*;
mod queue;
mod util;
use util::*;
/// Global job queue
static TASKS: OnceCell<UnboundedSender<JoinHandle<()>>> = OnceCell::new();
static TASKS_RUNNING: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(0));
static REQUEST_TICKETS: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(0));
pub async fn get_request_ticket() {
REQUEST_TICKETS.acquire().await.unwrap().forget();
}
macro_rules! spawn {
($e:expr) => {
TASKS.get().unwrap().unbounded_send(task::spawn($e)).unwrap();
};
}
#[tokio::main]
async fn main() {
let opt = Opt::from_args();
let rate = opt.rate;
task::spawn(async move {
let mut interval = time::interval(time::Duration::from_secs_f64(60.0 / rate as f64));
loop {
interval.tick().await;
REQUEST_TICKETS.add_permits(1);
}
});
if let Err(e) = real_main(opt).await {
error!(e);
}
@ -105,7 +72,7 @@ async fn login(opt: Opt, ignore: Gitignore) -> Result<ILIAS> {
error!(e)
} else {
success!("Session still active!");
return Ok(ilias)
return Ok(ilias);
}
},
Err(e) => warning!(e),
@ -141,9 +108,13 @@ async fn real_main(mut opt: Opt) -> Result<()> {
#[cfg(windows)]
let _ = colored::control::set_virtual_terminal(true);
create_dir(&opt.output).await.context("failed to create output directory")?;
create_dir(&opt.output)
.await
.context("failed to create output directory")?;
// use UNC paths on Windows (to avoid the default max. path length of 255)
opt.output = fs::canonicalize(opt.output).await.context("failed to canonicalize output directory")?;
opt.output = fs::canonicalize(opt.output)
.await
.context("failed to canonicalize output directory")?;
// load .iliasignore file
let (ignore, error) = Gitignore::new(opt.output.join(".iliasignore"));
@ -151,6 +122,8 @@ async fn real_main(mut opt: Opt) -> Result<()> {
warning!(err);
}
queue::set_download_rate(opt.rate);
let ilias = login(opt, ignore).await?;
if ilias.opt.content_tree {
@ -162,9 +135,7 @@ async fn real_main(mut opt: Opt) -> Result<()> {
}
}
let ilias = Arc::new(ilias);
let (tx, mut rx) = futures_channel::mpsc::unbounded::<JoinHandle<()>>();
TASKS.get_or_init(|| tx.clone());
TASKS_RUNNING.add_permits(ilias.opt.jobs);
let mut rx = queue::set_parallel_jobs(ilias.opt.jobs);
PROGRESS_BAR_ENABLED.store(atty::is(atty::Stream::Stdout), Ordering::SeqCst);
if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) {
PROGRESS_BAR.set_draw_target(ProgressDrawTarget::stderr_nohz());
@ -173,8 +144,13 @@ async fn real_main(mut opt: Opt) -> Result<()> {
}
let sync_url = ilias.opt.sync_url.as_deref().unwrap_or(DEFAULT_SYNC_URL);
let obj = Object::from_url(URL::from_href(sync_url).context("invalid sync URL")?, String::new(), None).context("invalid sync object")?; // name can be empty for first element
spawn!(process_gracefully(ilias.clone(), ilias.opt.output.clone(), obj));
let obj = Object::from_url(
URL::from_href(sync_url).context("invalid sync URL")?,
String::new(),
None,
)
.context("invalid sync object")?;
queue::spawn(process_gracefully(ilias.clone(), ilias.opt.output.clone(), obj));
while let Either::Left((task, _)) = future::select(rx.next(), future::ready(())).await {
if let Some(task) = task {
@ -208,11 +184,11 @@ async fn real_main(mut opt: Opt) -> Result<()> {
// https://github.com/rust-lang/rust/issues/53690#issuecomment-418911229
#[allow(clippy::manual_async_fn)]
fn process_gracefully(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl Future<Output = ()> + Send {
async move {
if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) {
PROGRESS_BAR.inc_length(1);
}
let permit = TASKS_RUNNING.acquire().await.unwrap();
async move {
let permit = queue::get_ticket();
let path_text = path.to_string_lossy().into_owned();
if let Err(e) = process(ilias, path, obj).await.context("failed to process URL") {
error!("Syncing {}", path_text; e);
@ -227,46 +203,9 @@ async fn handle_gracefully(fut: impl Future<Output = Result<()>>) {
}
}
// Several statics deliberately use lower-case names, hence the lint exemption.
#[allow(non_upper_case_globals)]
mod selectors {
	use once_cell::sync::Lazy;
	use regex::Regex;
	use scraper::Selector;

	/// Parse a CSS selector; panics if the (hard-coded) selector is invalid.
	fn parse_selector(css: &str) -> Selector {
		Selector::parse(css).unwrap()
	}

	/// Compile a regular expression; panics if the (hard-coded) pattern is invalid.
	fn compile_regex(pattern: &str) -> Regex {
		Regex::new(pattern).unwrap()
	}

	// Every selector / regex below is built lazily on first use and then reused.

	// generic HTML elements
	pub static LINKS: Lazy<Selector> = Lazy::new(|| parse_selector("a"));
	pub static a_target_blank: Lazy<Selector> = Lazy::new(|| parse_selector(r#"a[target="_blank"]"#));
	pub static IMAGES: Lazy<Selector> = Lazy::new(|| parse_selector("img"));
	pub static TABLES: Lazy<Selector> = Lazy::new(|| parse_selector("table"));
	pub static VIDEO_ROWS: Lazy<Selector> = Lazy::new(|| parse_selector(".ilTableOuter > div > table > tbody > tr"));
	pub static links_in_table: Lazy<Selector> = Lazy::new(|| parse_selector("tbody tr td a"));
	pub static th: Lazy<Selector> = Lazy::new(|| parse_selector("th"));
	pub static td: Lazy<Selector> = Lazy::new(|| parse_selector("td"));
	pub static tr: Lazy<Selector> = Lazy::new(|| parse_selector("tr"));

	// forum posts
	pub static post_row: Lazy<Selector> = Lazy::new(|| parse_selector(".ilFrmPostRow"));
	pub static post_title: Lazy<Selector> = Lazy::new(|| parse_selector(".ilFrmPostTitle"));
	pub static post_container: Lazy<Selector> = Lazy::new(|| parse_selector(".ilFrmPostContentContainer"));
	pub static post_attachments: Lazy<Selector> = Lazy::new(|| parse_selector(".ilFrmPostAttachmentsContainer"));
	pub static span_small: Lazy<Selector> = Lazy::new(|| parse_selector("span.small"));
	pub static forum_pages: Lazy<Selector> =
		Lazy::new(|| parse_selector("div.ilTableNav > table > tbody > tr > td > a"));

	// alerts and forms
	pub static alert_danger: Lazy<Selector> = Lazy::new(|| parse_selector("div.alert-danger"));
	pub static form_group: Lazy<Selector> = Lazy::new(|| parse_selector(".form-group"));
	pub static form_name: Lazy<Selector> = Lazy::new(|| parse_selector(".il_InfoScreenProperty"));

	// regular expressions
	pub static cmd_node_regex: Lazy<Regex> = Lazy::new(|| compile_regex(r#"cmdNode=uf:\w\w"#));
	pub static image_src_regex: Lazy<Regex> =
		Lazy::new(|| compile_regex(r#"\./data/produktiv/mobs/mm_(\d+)/([^?]+).+"#));
	pub static XOCT_REGEX: Lazy<Regex> =
		Lazy::new(|| compile_regex(r#"(?m)<script>\s+xoctPaellaPlayer\.init\(([\s\S]+)\)\s+</script>"#));

	// container / course listing
	pub static il_content_container: Lazy<Selector> = Lazy::new(|| parse_selector("#il_center_col"));
	pub static item_prop: Lazy<Selector> = Lazy::new(|| parse_selector("span.il_ItemProperty"));
	pub static container_items: Lazy<Selector> = Lazy::new(|| parse_selector("div.il_ContainerListItem"));
	pub static container_item_title: Lazy<Selector> = Lazy::new(|| parse_selector("a.il_ContainerItemTitle"));
}
use crate::selectors::*;
const NO_ENTRIES: &str = "Keine Einträge";
async fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> Result<()> {
let relative_path = path.strip_prefix(&ilias.opt.output).unwrap();
if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) {
PROGRESS_BAR.inc(1);
let path = relative_path.display().to_string();
if !path.is_empty() {
PROGRESS_BAR.set_message(path);
@ -284,447 +223,31 @@ async fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> Result<()> {
}
match &obj {
Course { url, name } => {
let content = if ilias.opt.content_tree {
let html = ilias.download(&url.url).await?.text().await?;
let cmd_node = cmd_node_regex.find(&html).context("can't find cmdNode")?.as_str()[8..].to_owned();
let content_tree = ilias.get_course_content_tree(&url.ref_id, &cmd_node).await;
match content_tree {
Ok(tree) => (tree.into_iter().map(Result::Ok).collect(), None),
Err(e) => {
// some folders are hidden on the course page and can only be found via the RSS feed / recent activity / content tree sidebar
// TODO: this is probably never the case for folders?
if html.contains(r#"input[name="cmd[join]""#) {
return Ok(()); // ignore groups we are not in
}
warning!(name, "falling back to incomplete course content extractor!", e);
ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x
},
}
} else {
ilias.get_course_content(&url).await?
};
if let Some(s) = content.1.as_ref() {
let path = path.join("course.html");
write_file_data(&path, &mut s.as_bytes())
.await
.context("failed to write course page html")?;
}
for item in content.0 {
let item = item?;
let path = path.join(file_escape(item.name()));
let ilias = Arc::clone(&ilias);
spawn!(process_gracefully(ilias, path, item));
}
ilias::course::download(path, ilias, url, name).await?;
},
Folder { url, .. } | PersonalDesktop { url } => {
let content = ilias.get_course_content(&url).await?;
if let Some(s) = content.1.as_ref() {
let path = path.join("folder.html");
write_file_data(&path, &mut s.as_bytes())
.await
.context("failed to write folder page html")?;
}
for item in content.0 {
let item = item?;
let path = path.join(file_escape(item.name()));
let ilias = Arc::clone(&ilias);
spawn!(process_gracefully(ilias, path, item));
}
ilias::folder::download(&path, ilias, url).await?;
},
File { url, .. } => {
if ilias.opt.skip_files {
return Ok(());
}
if !ilias.opt.force && fs::metadata(&path).await.is_ok() {
log!(2, "Skipping download, file exists already");
return Ok(());
}
let data = ilias.download(&url.url).await?;
let mut reader = StreamReader::new(data.bytes_stream().map_err(|x| io::Error::new(io::ErrorKind::Other, x)));
log!(0, "Writing {}", relative_path.to_string_lossy());
write_file_data(&path, &mut reader).await?;
ilias::file::download(&path, relative_path, ilias, url).await?;
},
PluginDispatch { url, .. } => {
if ilias.opt.no_videos {
return Ok(());
}
let full_url = {
// first find the link to full video list
let list_url = format!("{}ilias.php?ref_id={}&cmdClass=xocteventgui&cmdNode=nc:n4:14u&baseClass=ilObjPluginDispatchGUI&lang=de&limit=20&cmd=asyncGetTableGUI&cmdMode=asynch", ILIAS_URL, url.ref_id);
log!(1, "Loading {}", list_url);
let data = ilias.download(&list_url).await?;
let html = data.text().await?;
let html = Html::parse_fragment(&html);
html.select(&LINKS)
.filter_map(|link| link.value().attr("href"))
.filter(|href| href.contains("trows=800"))
.map(|x| x.to_string())
.next()
.context("video list link not found")?
};
log!(1, "Rewriting {}", full_url);
let mut full_url = Url::parse(&format!("{}{}", ILIAS_URL, full_url))?;
let mut query_parameters = full_url.query_pairs().map(|(x, y)| (x.into_owned(), y.into_owned())).collect::<Vec<_>>();
for (key, value) in &mut query_parameters {
match key.as_ref() {
"cmd" => *value = "asyncGetTableGUI".into(),
"cmdClass" => *value = "xocteventgui".into(),
_ => {},
}
}
query_parameters.push(("cmdMode".into(), "asynch".into()));
full_url.query_pairs_mut().clear().extend_pairs(&query_parameters).finish();
log!(1, "Loading {}", full_url);
let data = ilias.download(full_url.as_str()).await?;
let html = data.text().await?;
let html = Html::parse_fragment(&html);
for row in html.select(&VIDEO_ROWS) {
let link = row.select(&a_target_blank).next();
if link.is_none() {
if !row.text().any(|x| x == NO_ENTRIES) {
warning!(format => "table row without link in {}", url.url);
}
continue;
}
let link = link.unwrap();
let mut cells = row.select(&td);
if let Some(title) = cells.nth(2) {
let title = title.text().collect::<String>();
let title = title.trim();
if title.starts_with("<div") {
continue;
}
let mut path = path.clone();
path.push(format!("{}.mp4", file_escape(title)));
log!(1, "Found video: {}", title);
let video = Video {
url: URL::raw(link.value().attr("href").context("video link without href")?.to_owned()),
};
let ilias = Arc::clone(&ilias);
spawn!(process_gracefully(ilias, path, video));
}
}
ilias::plugin_dispatch::download(&path, ilias, url).await?;
},
Video { url } => {
if ilias.opt.no_videos {
return Ok(());
}
if fs::metadata(&path).await.is_ok() && !(ilias.opt.force || ilias.opt.check_videos) {
log!(2, "Skipping download, file exists already");
return Ok(());
}
let url = format!("{}{}", ILIAS_URL, url.url);
let data = ilias.download(&url);
let html = data.await?.text().await?;
log!(2, "{}", html);
let json: serde_json::Value = {
let mut json_capture = XOCT_REGEX.captures_iter(&html);
let json = &json_capture.next().context("xoct player json not found")?[1];
log!(2, "{}", json);
let json = json.split(",\n").next().context("invalid xoct player json")?;
serde_json::from_str(&json.trim())?
};
log!(2, "{}", json);
let url = json
.pointer("/streams/0/sources/mp4/0/src")
.context("video src not found")?
.as_str()
.context("video src not string")?;
let meta = fs::metadata(&path).await;
if !ilias.opt.force && meta.is_ok() && ilias.opt.check_videos {
let head = ilias.head(url).await.context("HEAD request failed")?;
if let Some(len) = head.headers().get("content-length") {
if meta?.len() != len.to_str()?.parse::<u64>()? {
warning!(relative_path.to_string_lossy(), "was updated, consider moving the outdated file");
}
}
} else {
let resp = ilias.download(&url).await?;
let mut reader = StreamReader::new(resp.bytes_stream().map_err(|x| io::Error::new(io::ErrorKind::Other, x)));
log!(0, "Writing {}", relative_path.to_string_lossy());
write_file_data(&path, &mut reader).await?;
}
ilias::video::download(&path, relative_path, ilias, url).await?;
},
Forum { url, .. } => {
if !ilias.opt.forum {
return Ok(());
}
let url = &url.url;
let html = {
let data = ilias.download(url);
let html_text = data.await?.text().await?;
let url = {
let html = Html::parse_document(&html_text);
let thread_count_selector = html.select(&LINKS).flat_map(|x| x.value().attr("href")).find(|x| x.contains("trows=800"));
if thread_count_selector.is_none() {
if let Some(cell) = html.select(&td).next() {
if cell.text().any(|x| x == NO_ENTRIES) {
return Ok(()); // empty forum
}
}
}
thread_count_selector
.context("can't find forum thread count selector (empty forum?)")?
.to_owned()
};
let data = ilias.download(&url);
let html = data.await?.text().await?;
Html::parse_document(&html)
};
for row in html.select(&tr) {
if row.value().attr("class") == Some("hidden-print") {
continue; // thread count
}
if row.select(&th).next().is_some() {
continue; // table header
}
let cells = row.select(&td).collect::<Vec<_>>();
if cells.len() != 6 {
warning!(format =>
"Warning: {}{} {} {}",
"unusual table row (", cells.len(), "cells) in", url.to_string()
);
continue;
}
let link = cells[1].select(&LINKS).next().context("thread link not found")?;
let object = Object::from_link(link, link)?;
let mut path = path.clone();
let name = format!(
"{}_{}",
object.url().thr_pk.as_ref().context("thr_pk not found for thread")?,
link.text().collect::<String>().trim()
);
path.push(file_escape(&name));
// TODO: set modification date?
let saved_posts = {
match std::fs::read_dir(&path) {
// TODO: make this async
Ok(stream) => stream.count(),
Err(_) => 0,
}
};
let available_posts = cells[3]
.text()
.next()
.unwrap_or_default()
.trim()
.parse::<usize>()
.context("parsing post count failed")?;
if available_posts <= saved_posts && !ilias.opt.force {
continue;
}
let ilias = Arc::clone(&ilias);
spawn!(process_gracefully(ilias, path, object));
}
if html.select(&forum_pages).count() > 0 {
log!(0, "Ignoring older threads in {:?}..", path);
}
ilias::forum::download(&path, ilias, url).await?;
},
Thread { url } => {
if !ilias.opt.forum {
return Ok(());
}
let mut all_images = Vec::new();
let mut attachments = Vec::new();
{
let html = ilias.get_html(&url.url).await?;
for post in html.select(&post_row) {
let title = post
.select(&post_title)
.next()
.context("post title not found")?
.text()
.collect::<String>();
let author = post.select(&span_small).next().context("post author not found")?;
let author = author.text().collect::<String>();
let author = author.trim().split('|').collect::<Vec<_>>();
let author = if author.len() == 2 {
author[0] // pseudonymous forum
} else if author.len() == 3 {
if author[1] != "Pseudonym" {
author[1]
} else {
author[0]
}
} else {
return Err(anyhow!("author data in unknown format"));
}
.trim();
let container = post.select(&post_container).next().context("post container not found")?;
let link = container.select(&LINKS).next().context("post link not found")?;
let id = link.value().attr("id").context("no id in thread link")?.to_owned();
let name = format!("{}_{}_{}.html", id, author, title.trim());
let data = container.inner_html();
let path = path.join(file_escape(&name));
let relative_path = relative_path.join(file_escape(&name));
spawn!(handle_gracefully(async move {
log!(0, "Writing {}", relative_path.display());
write_file_data(&path, &mut data.as_bytes()).await.context("failed to write forum post")
}));
let images = container.select(&IMAGES).map(|x| x.value().attr("src").map(|x| x.to_owned()));
for image in images {
let image = image.context("no src on image")?;
all_images.push((id.clone(), image));
}
if let Some(container) = container.select(&post_attachments).next() {
for attachment in container.select(&LINKS) {
attachments.push((
id.clone(),
attachment.text().collect::<String>(),
attachment.value().attr("href").map(|x| x.to_owned()),
));
}
}
}
// pagination
if let Some(pages) = html.select(&TABLES).next() {
if let Some(last) = pages.select(&links_in_table).last() {
let text = last.text().collect::<String>();
if text.trim() == ">>" {
// not last page yet
let ilias = Arc::clone(&ilias);
let next_page = Thread {
url: URL::from_href(last.value().attr("href").context("page link not found")?)?,
};
spawn!(process_gracefully(ilias, path.clone(), next_page));
}
} else {
log!(
0,
"Warning: {} {}",
"unable to find pagination links in".bright_yellow(),
url.url.to_string().bright_yellow()
);
}
}
}
for (id, image) in all_images {
let src = URL::from_href(&image)?;
let dl = ilias.download(&src.url).await?;
let mut path = path.clone();
if let Some(m) = image_src_regex.captures(&image) {
// image uploaded to ILIAS
let (media_id, filename) = (m.get(1).unwrap().as_str(), m.get(2).unwrap().as_str());
path.push(file_escape(&format!("{}_{}_{}", id, media_id, filename)));
} else {
// external image
path.push(file_escape(&format!("{}_{}", id, image)));
}
spawn!(handle_gracefully(async move {
let bytes = dl.bytes().await?;
write_file_data(&path, &mut &*bytes)
.await
.context("failed to write forum post image attachment")
}));
}
for (id, name, url) in attachments {
let url = url.context("attachment without href")?;
let src = URL::from_href(&url)?;
let dl = ilias.download(&src.url).await?;
let mut path = path.clone();
path.push(file_escape(&format!("{}_{}", id, name)));
spawn!(handle_gracefully(async move {
let bytes = dl.bytes().await?;
write_file_data(&path, &mut &*bytes)
.await
.context("failed to write forum post file attachment")
}));
}
ilias::thread::download(&path, relative_path, ilias, url).await?;
},
ExerciseHandler { url, .. } => {
let html = ilias.get_html(&url.url).await?;
let mut filenames = HashSet::new();
for row in html.select(&form_group) {
let link = row.select(&LINKS).next();
if link.is_none() {
continue;
}
let link = link.unwrap();
let href = link.value().attr("href");
if href.is_none() {
continue;
}
let href = href.unwrap();
let url = URL::from_href(href)?;
let cmd = url.cmd.as_deref().unwrap_or("");
if cmd != "downloadFile" && cmd != "downloadGlobalFeedbackFile" && cmd != "downloadFeedbackFile" {
continue;
}
// link is definitely just a download link to the exercise or the solution
let name = row
.select(&form_name)
.next()
.context("link without file name")?
.text()
.collect::<String>()
.trim()
.to_owned();
let item = File { url, name };
let mut path = path.clone();
// handle files with the same name
let filename = file_escape(item.name());
let mut parts = filename.rsplitn(2, '.');
let extension = parts.next().unwrap_or(&filename);
let name = parts.next().unwrap_or("");
let mut unique_filename = filename.clone();
let mut i = 1;
while filenames.contains(&unique_filename) {
i += 1;
if name.is_empty() {
unique_filename = format!("{}{}", extension, i);
} else {
unique_filename = format!("{}{}.{}", name, i, extension);
}
}
filenames.insert(unique_filename.clone());
path.push(unique_filename);
let ilias = Arc::clone(&ilias);
spawn!(process_gracefully(ilias, path, item));
}
ilias::exercise::download(&path, ilias, url).await?;
},
Weblink { url, .. } => {
if !ilias.opt.force && fs::metadata(&path).await.is_ok() {
log!(2, "Skipping download, link exists already");
return Ok(());
}
let head_req_result = ilias.head(&url.url).await;
let url = match &head_req_result {
Err(e) => e.url().context("HEAD request failed")?.as_str(),
Ok(head) => head.url().as_str(),
};
if url.starts_with(ILIAS_URL) {
// is a link list
if fs::metadata(&path).await.is_err() {
create_dir(&path).await?;
log!(0, "Writing {}", relative_path.to_string_lossy());
}
let urls = {
let html = ilias.get_html(url).await?;
html.select(&LINKS)
.filter_map(|x| x.value().attr("href").map(|y| (y, x.text().collect::<String>())))
.map(|(x, y)| URL::from_href(x).map(|z| (z, y.trim().to_owned())).context("parsing weblink"))
.collect::<Result<Vec<_>>>()
}?;
for (url, name) in urls {
if url.cmd.as_deref().unwrap_or("") != "callLink" {
continue;
}
let head = ilias.head(url.url.as_str()).await.context("HEAD request to web link failed");
if let Some(err) = head.as_ref().err() {
warning!(err);
continue;
}
let head = head.unwrap();
let url = head.url().as_str();
write_file_data(path.join(file_escape(&name)), &mut url.as_bytes()).await?;
}
} else {
log!(0, "Writing {}", relative_path.to_string_lossy());
write_file_data(&path, &mut url.as_bytes()).await.context("failed to save weblink URL")?;
}
ilias::weblink::download(&path, relative_path, ilias, url).await?;
},
Wiki { .. } => {
log!(1, "Ignored wiki!");
@ -733,11 +256,17 @@ async fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> Result<()> {
log!(1, "Ignored survey!");
},
Presentation { .. } => {
log!(1, "Ignored interactive presentation! (visit it yourself, it's probably interesting)");
log!(
1,
"Ignored interactive presentation! (visit it yourself, it's probably interesting)"
);
},
Generic { .. } => {
log!(1, "Ignored generic {:?}", obj)
},
}
if PROGRESS_BAR_ENABLED.load(Ordering::SeqCst) {
PROGRESS_BAR.inc(1);
}
Ok(())
}

50
src/queue.rs Normal file
View File

@ -0,0 +1,50 @@
use futures::Future;
use futures_channel::mpsc::{UnboundedReceiver, UnboundedSender};
use once_cell::sync::{Lazy, OnceCell};
use tokio::{
sync::{Semaphore, SemaphorePermit},
task::{self, JoinHandle},
time,
};
/// Global job queue
///
/// Sending half of the channel over which `spawn` reports the `JoinHandle` of every task it
/// starts. Empty until `set_parallel_jobs` has been called.
static TASKS: OnceCell<UnboundedSender<JoinHandle<()>>> = OnceCell::new();
/// Job slots handed out by `get_ticket`. Starts with zero permits; `set_parallel_jobs` adds them.
static TASKS_RUNNING: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(0));
/// Request tickets consumed by `get_request_ticket`. Starts with zero permits; the task started
/// by `set_download_rate` adds one permit per interval tick.
static REQUEST_TICKETS: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(0));
/// Wait until a request ticket is available and use it up.
///
/// Takes one permit from `REQUEST_TICKETS` and forgets it, so the permit is *not* handed back to
/// the semaphore afterwards. Fresh permits only come from the ticker started by
/// `set_download_rate`; without that ticker this future never resolves.
///
/// # Panics
///
/// Panics if the semaphore has been closed, which nothing in this module does.
pub async fn get_request_ticket() {
	let ticket = REQUEST_TICKETS.acquire().await.unwrap();
	// forget() consumes the permit without returning it to the semaphore
	ticket.forget();
}
pub async fn get_ticket() -> SemaphorePermit<'static> {
TASKS_RUNNING.acquire().await.unwrap()
}
/// Start `e` as a new tokio task and register its handle in the global job queue.
///
/// The task's `JoinHandle` is sent through `TASKS`, so whoever holds the receiver returned by
/// `set_parallel_jobs` can wait for it.
///
/// # Panics
///
/// Panics if `set_parallel_jobs` has not been called yet (the queue is uninitialised) or if the
/// handle cannot be sent, e.g. because the receiving end was dropped.
pub fn spawn(e: impl Future<Output = ()> + Send + 'static) {
	// Look the queue up first so that nothing is spawned when it is missing.
	let queue = TASKS.get().unwrap();
	let handle = task::spawn(e);
	queue.unbounded_send(handle).unwrap();
}
/// Start the background task that issues request tickets at a fixed rate.
///
/// `rate` is the number of tickets per minute: the spawned task adds one permit to
/// `REQUEST_TICKETS` every `60 / rate` seconds, which `get_request_ticket` then consumes.
/// The first tick of a tokio interval completes immediately, so the first ticket is issued
/// right away.
///
/// A `rate` of 0 is replaced by 1 (with a warning). Previously it produced an infinite period,
/// which made `Duration::from_secs_f64` panic inside the detached task; no ticket was ever
/// issued and every caller of `get_request_ticket` waited forever.
///
/// Must be called from within a tokio runtime, since it uses `task::spawn`.
pub fn set_download_rate(rate: usize) {
	let rate = if rate == 0 {
		log!(0, "Warning: download rate must be at least 1 request per minute, using 1");
		1
	} else {
		rate
	};
	let period = time::Duration::from_secs_f64(60.0 / rate as f64);
	task::spawn(async move {
		let mut interval = time::interval(period);
		loop {
			interval.tick().await;
			// NOTE(review): this is logged at level 0, the same level used for warnings, so it is
			// printed on every tick. Looks like leftover debug output; confirm whether it
			// should be demoted or removed.
			log!(
				0,
				"interval ticked @ {}",
				std::time::SystemTime::now()
					.duration_since(std::time::SystemTime::UNIX_EPOCH)
					.unwrap()
					.as_secs()
			);
			REQUEST_TICKETS.add_permits(1);
		}
	});
}
/// Set up the global job queue and allow `jobs` tasks to hold a slot at the same time.
///
/// Creates the channel used by `spawn`, stores its sending half in `TASKS` and adds `jobs`
/// permits to `TASKS_RUNNING`. Returns the receiving half, which yields the `JoinHandle` of
/// every task started through `spawn`.
///
/// If `TASKS` is already initialised the existing sender is kept; the receiver returned by such
/// a repeated call then has no sender attached, while the permits are still added.
pub fn set_parallel_jobs(jobs: usize) -> UnboundedReceiver<JoinHandle<()>> {
	let (sender, receiver) = futures_channel::mpsc::unbounded();
	// Either the sender ends up in TASKS or it is dropped together with the closure.
	TASKS.get_or_init(move || sender);
	TASKS_RUNNING.add_permits(jobs);
	receiver
}

View File

@ -1,21 +1,38 @@
// SPDX-License-Identifier: GPL-3.0-or-later
use anyhow::Context;
use bytes::Bytes;
use futures::TryStreamExt;
use tokio::fs::File as AsyncFile;
use tokio::io::{AsyncRead, BufWriter};
use tokio_util::io::StreamReader;
use std::io;
use std::path::Path;
use crate::Result;
/// Write every chunk of `stream` to the file at `path`.
///
/// Stream errors are wrapped into `io::Error`s of kind `Other`, so the stream can be read through
/// a `StreamReader`; the actual writing is done by `write_file_data`.
///
/// # Errors
///
/// Returns whatever error `write_file_data` reports.
pub async fn write_stream_to_file(
	path: &Path,
	stream: impl futures::Stream<Item = Result<Bytes, reqwest::Error>> + Unpin,
) -> Result<()> {
	let io_stream = stream.map_err(|err| io::Error::new(io::ErrorKind::Other, err));
	let mut reader = StreamReader::new(io_stream);
	write_file_data(path, &mut reader).await
}
/// Write all data to the specified path. Will overwrite previous file data.
///
/// The file is created (or truncated) once, wrapped in a `BufWriter`, and `data` is copied into
/// it until the reader is exhausted.
///
/// # Errors
///
/// Fails with the context "failed to create file" if the file cannot be created, and with
/// "failed to write to file" if copying the data fails.
pub async fn write_file_data<R: ?Sized>(path: impl AsRef<Path>, data: &mut R) -> Result<()>
where
	R: AsyncRead + Unpin,
{
	let file = AsyncFile::create(path.as_ref())
		.await
		.context("failed to create file")?;
	let mut file = BufWriter::new(file);
	tokio::io::copy(data, &mut file)
		.await
		.context("failed to write to file")?;
	Ok(())
}