From e0f91f74814baef3eb4ddd0189ee12bc23f08afc Mon Sep 17 00:00:00 2001
From: Kitaiti Makoto
Date: Sun, 28 Mar 2021 20:54:53 +0900
Subject: [PATCH] Add remove-dup-images command

---
 plume-cli/Cargo.toml               |   4 +
 plume-cli/src/remove-dup-images.rs | 201 +++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100644 plume-cli/src/remove-dup-images.rs

diff --git a/plume-cli/Cargo.toml b/plume-cli/Cargo.toml
index 23bed8c0..d1db947f 100644
--- a/plume-cli/Cargo.toml
+++ b/plume-cli/Cargo.toml
@@ -8,6 +8,10 @@ edition = "2018"
 name = "plm"
 path = "src/main.rs"
 
+[[bin]]
+name = "remove-dup-images"
+path = "src/remove-dup-images.rs"
+
 [dependencies]
 clap = "2.33"
 dotenv = "0.14"
diff --git a/plume-cli/src/remove-dup-images.rs b/plume-cli/src/remove-dup-images.rs
new file mode 100644
index 00000000..f5fc5a2b
--- /dev/null
+++ b/plume-cli/src/remove-dup-images.rs
@@ -0,0 +1,201 @@
+//! Removes orphan `medias` records and the files nothing refers to any more.
+//!
+//! A local media counts as an orphan when no user avatar, blog icon, blog banner or post cover
+//! points at it. When the file of an orphan has the same content hash as the cover of a remote
+//! post, only the database record is removed and the file is kept.
+
+use diesel::{
+    BoolExpressionMethods, Connection, ExpressionMethods, JoinOnDsl, NullableExpressionMethods,
+    QueryDsl, RunQueryDsl,
+};
+use plume_models::{
+    blogs::Blog, instance::Instance, medias::Media, posts::Post, Connection as Conn, CONFIG,
+};
+use std::collections::hash_map::{DefaultHasher, HashMap};
+use std::fs::File;
+use std::hash::Hasher;
+use std::io::{self, BufReader, ErrorKind, Read};
+use std::path::Path;
+
+/// Loads the configuration, connects to the database and cleans up orphan medias.
+///
+/// Panics on configuration, connection, query or file I/O failures.
+fn main() {
+    match dotenv::dotenv() {
+        Ok(path) => eprintln!("Configuration read from {}", path.display()),
+        Err(ref e) if e.not_found() => eprintln!("no .env was found"),
+        e => e.map(|_| ()).unwrap(),
+    }
+    let conn = Conn::establish(CONFIG.database_url.as_str()).expect("establish connection");
+    Instance::cache_local(&conn);
+
+    let covers = get_remote_post_covers(&conn);
+    let remote_media_hashes = calculate_remote_media_hashes(covers);
+    eprintln!("remote medias: {:?}", remote_media_hashes);
+
+    let orphan_medias = get_orphan_medias(&conn);
+    eprintln!("{:?} orphan media(s)", orphan_medias.len());
+
+    for media in orphan_medias {
+        let file_hash = calculate_file_hash(Path::new(&media.file_path));
+        match file_hash.map(|hash| remote_media_hashes.get(&hash)) {
+            Some(Some(file_path)) => {
+                // Same content as a cover of a remote post: keep the file on disk.
+                eprintln!(
+                    "File already referred. Removes only medias record. {}",
+                    file_path
+                );
+                delete_media_record(&conn, &media);
+            }
+            Some(None) => {
+                eprintln!("Removes {}", &media.file_path);
+                // `Media::delete` is expected to remove the file as well as the record.
+                media.delete(&conn).expect("Delete media record and file");
+            }
+            None => {
+                eprintln!(
+                    "File doesn't exist. Removes medias record. medias.id: {}, path: {}",
+                    &media.id, &media.file_path
+                );
+                delete_media_record(&conn, &media);
+            }
+        }
+    }
+}
+
+/// Deletes only the `medias` row of `media`; no file is touched here.
+fn delete_media_record(conn: &Conn, media: &Media) {
+    diesel::delete(media)
+        .execute(conn)
+        .expect("Delete medias record");
+}
+
+/// Loads the cover media of every post that belongs to a blog of a remote instance.
+///
+/// Panics if a query fails or if a post's `cover_id` has no matching media.
+fn get_remote_post_covers(conn: &Conn) -> Vec<Media> {
+    use plume_models::schema::blogs;
+    use plume_models::schema::posts;
+
+    let remote_instances = Instance::get_remotes(conn).expect("get remote instances");
+    let remote_instance_ids = remote_instances.iter().map(|instance| instance.id);
+    let remote_blogs = blogs::table
+        .filter(blogs::instance_id.eq_any(remote_instance_ids))
+        .load::<Blog>(conn)
+        .expect("remote blogs");
+    let remote_blog_ids = remote_blogs.iter().map(|blog| blog.id);
+    let remote_posts = posts::table
+        .filter(posts::blog_id.eq_any(remote_blog_ids))
+        .load::<Post>(conn)
+        .expect("remote posts");
+    remote_posts
+        .iter()
+        .filter_map(|post| post.cover_id)
+        .map(|cover_id| Media::get(conn, cover_id).expect("Media"))
+        .collect()
+}
+
+/// Maps the content hash of each media file to that media's `file_path`.
+///
+/// Medias whose file does not exist are skipped. When several files hash to the same value,
+/// the path of the last one is kept.
+fn calculate_remote_media_hashes(medias: Vec<Media>) -> HashMap<u64, String> {
+    medias
+        .iter()
+        .filter_map(|media| {
+            calculate_file_hash(Path::new(&media.file_path))
+                .map(|hash| (hash, media.file_path.clone()))
+        })
+        .collect()
+}
+
+/// Hashes the content of the file at `path` with `DefaultHasher`.
+///
+/// Returns `None` when the file does not exist. The hash is only meant to be compared with
+/// other hashes computed during the same run.
+///
+/// Panics when the file exists but cannot be opened or read: a hash of partially read
+/// content must never decide whether a file gets deleted.
+fn calculate_file_hash(path: &Path) -> Option<u64> {
+    // Open directly instead of checking `path.exists()` first, so the file cannot vanish
+    // between the check and the open.
+    let file = match File::open(path) {
+        Ok(file) => file,
+        Err(ref e) if e.kind() == ErrorKind::NotFound => return None,
+        Err(e) => panic!("open file: {:?}", e),
+    };
+    let mut reader = BufReader::new(file);
+    let mut hasher = DefaultHasher::new();
+    let mut buffer = [0u8; 2048];
+
+    loop {
+        let filled = fill_buffer(&mut reader, &mut buffer).expect("read file");
+        // Feed only the bytes read in this round; the rest of `buffer` is stale.
+        hasher.write(&buffer[..filled]);
+        if filled < buffer.len() {
+            // A short fill means end of file.
+            break;
+        }
+    }
+    Some(hasher.finish())
+}
+
+/// Reads until `buffer` is full or end of file; returns the number of bytes read.
+///
+/// Filling the buffer completely keeps the chunks passed to the hasher independent of how
+/// the underlying reads happen to be split. Interrupted reads are retried.
+fn fill_buffer<R: Read>(reader: &mut R, buffer: &mut [u8]) -> io::Result<usize> {
+    let mut filled = 0;
+    while filled < buffer.len() {
+        match reader.read(&mut buffer[filled..]) {
+            Ok(0) => break,
+            Ok(n) => filled += n,
+            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        }
+    }
+    Ok(filled)
+}
+
+/// Loads every local (`is_remote == false`) media that is not used as a user avatar, blog
+/// icon, blog banner or post cover.
+///
+/// A debug rendering of the query is printed to stderr first. Panics if the query fails.
+fn get_orphan_medias(conn: &Conn) -> Vec<Media> {
+    use plume_models::schema::{self, medias};
+    use plume_models::schema::{blogs::dsl::blogs, posts::dsl::posts, users::dsl::users};
+    let query = medias::table
+        .select((
+            medias::id,
+            medias::file_path,
+            medias::alt_text,
+            medias::is_remote,
+            medias::remote_url,
+            medias::sensitive,
+            medias::content_warning,
+            medias::owner_id,
+        ))
+        .left_outer_join(users.on(schema::users::avatar_id.eq(medias::id.nullable())))
+        .left_outer_join(
+            blogs.on(schema::blogs::icon_id
+                .eq(medias::id.nullable())
+                .or(schema::blogs::banner_id.eq(medias::id.nullable()))),
+        )
+        .left_outer_join(posts.on(schema::posts::cover_id.eq(medias::id.nullable())))
+        .filter(
+            schema::users::avatar_id.is_null().and(
+                schema::blogs::icon_id.is_null().and(
+                    schema::blogs::banner_id.is_null().and(
+                        schema::posts::cover_id
+                            .is_null()
+                            .and(medias::is_remote.eq(false)),
+                    ),
+                ),
+            ),
+        );
+    eprintln!(
+        "query for orphan medias: {}",
+        diesel::debug_query::<_, _>(&query)
+    );
+    query.load::<Media>(conn).expect("Load orphan medias")
+}