Loading Cargo.lock +37 −0 Original line number Diff line number Diff line Loading @@ -74,6 +74,7 @@ dependencies = [ "data-encoding", "derive_more", "directories", "docx-rs", "dotenvy", "duckdb", "exitcode", Loading Loading @@ -1954,6 +1955,20 @@ dependencies = [ "litrs", ] [[package]] name = "docx-rs" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed73cbf5e1c37baa23f4132569ac1187829f03922c206bd68fe109e3001a343d" dependencies = [ "base64 0.22.1", "quick-xml 0.36.2", "serde", "serde_json", "thiserror 2.0.18", "zip 0.6.6", ] [[package]] name = "dotenvy" version = "0.15.7" Loading Loading @@ -5343,6 +5358,16 @@ dependencies = [ "memchr", ] [[package]] name = "quick-xml" version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" dependencies = [ "encoding_rs", "memchr", ] [[package]] name = "quick-xml" version = "0.39.1" Loading Loading @@ -9223,6 +9248,18 @@ dependencies = [ "time 0.1.45", ] [[package]] name = "zip" version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ "byteorder", "crc32fast", "crossbeam-utils", "flate2", ] [[package]] name = "zip" version = "6.0.0" Loading Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ derive_more = { version = "2.1.1", features = [ "from_str", ], default-features = false } dotenvy = { version = "0.15.0", default-features = false } docx-rs = { version = "0.4.17", default-features = false } duckdb = { version = "1.10502.0", default-features = false } fancy-regex = { version = "0.17.0", default-features = false } futures = { version = "0.3.32", features = ["alloc"], default-features = false } Loading acorn-cli/src/commands/export/powerpoint.rs +2 −2 Original line number Diff line number Diff line Loading @@ -3,9 +3,9 @@ //! This module generates PPTX output by copying slide templates, updating //! OOXML relationships, and interpolating Research Activity data. use crate::cli::CommandOptions; use acorn::io::powerpoint::ooxml::{Relationship, Relationships}; use acorn::io::powerpoint::{interpolate_values, read_xml_rel}; use acorn::io::{archive, extract_zip, read_file, write_file, ApiResult, InputOutput}; use acorn::powerpoint::ooxml::{Relationship, Relationships}; use acorn::powerpoint::{interpolate_values, read_xml_rel}; use acorn::prelude::{copy, create_dir_all, exit, Path, PathBuf}; use acorn::schema::research_activity::ResearchActivity; use acorn::util::{Label, StringConversion}; Loading acorn-lib/Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,7 @@ data-encoding = { workspace = true } derive_more = { workspace = true } directories = { version = "6.0.0", default-features = false } dotenvy = { workspace = true } docx-rs = { workspace = true } duckdb = { workspace = true, features = ["bundled"], default-features = false, optional = true } fancy-regex = { workspace = true } flate2 = { version = "1.1.5", default-features = false } Loading acorn-lib/src/io/docx/mod.rs 0 → 100644 +159 −0 Original line number Diff line number Diff line //! DOCX input utilities. use crate::io::ApiResult; use crate::prelude::{read, PathBuf}; use color_eyre::eyre::eyre; use docx_rs::{ read_docx, Delete, DeleteChild, DocumentChild, Hyperlink, Insert, InsertChild, Paragraph, ParagraphChild, Run, RunChild, Table, TableCell, TableCellContent, TableChild, TableRow, TableRowChild, }; fn extract_from_table(table: &Table) -> String { table .rows .iter() .map(|row| match row { | TableChild::TableRow(value) => extract_from_table_row(value), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_table_row(row: &TableRow) -> String { row.cells .iter() .map(|cell| match cell { | TableRowChild::TableCell(value) => extract_from_table_cell(value), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_table_cell(cell: &TableCell) -> String { cell.children .iter() .map(|content| match content { | TableCellContent::Paragraph(paragraph) => extract_from_paragraph(paragraph), | TableCellContent::Table(table) => extract_from_table(table), | TableCellContent::StructuredDataTag(_) | TableCellContent::TableOfContents(_) => String::new(), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_paragraph(paragraph: &Paragraph) -> String { paragraph .children .iter() .map(|child| match child { | ParagraphChild::Run(run) => extract_from_run(run), | ParagraphChild::Hyperlink(hyperlink) => extract_from_hyperlink(hyperlink), | ParagraphChild::Insert(insert) => extract_from_insert(insert), | ParagraphChild::Delete(delete) => extract_from_delete(delete), | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => " ".to_string(), | ParagraphChild::BookmarkStart(_) | ParagraphChild::BookmarkEnd(_) | ParagraphChild::CommentStart(_) | ParagraphChild::CommentEnd(_) | ParagraphChild::StructuredDataTag(_) => "".to_string(), }) .collect::<String>() } fn extract_from_hyperlink(hyperlink: &Hyperlink) -> String { hyperlink .children .iter() .map(|child| match child { | ParagraphChild::Run(run) => extract_from_run(run), | ParagraphChild::Insert(insert) => extract_from_insert(insert), | ParagraphChild::Delete(delete) => extract_from_delete(delete), | ParagraphChild::Hyperlink(_) => "".to_string(), | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => " ".to_string(), | ParagraphChild::BookmarkStart(_) | ParagraphChild::BookmarkEnd(_) | ParagraphChild::CommentStart(_) | ParagraphChild::CommentEnd(_) | ParagraphChild::StructuredDataTag(_) => "".to_string(), }) .collect::<String>() } fn extract_from_insert(insert: &Insert) -> String { insert .children .iter() .map(|child| match child { | InsertChild::Run(run) => extract_from_run(run), | InsertChild::Delete(delete) => extract_from_delete(delete), | InsertChild::CommentStart(_) | InsertChild::CommentEnd(_) => "".to_string(), }) .collect::<String>() } fn extract_from_delete(delete: &Delete) -> String { delete .children .iter() .map(|child| match child { | DeleteChild::Run(run) => extract_from_run(run), | DeleteChild::CommentStart(_) | DeleteChild::CommentEnd(_) => "".to_string(), }) .collect::<String>() } fn extract_from_run(run: &Run) -> String { run.children .iter() .map(|child| match child { | RunChild::Text(text) => text.text.clone(), | RunChild::InstrTextString(text) => text.clone(), | RunChild::Tab(_) => "\t".to_string(), | RunChild::Break(_) => "\n".to_string(), | RunChild::Sym(_) | RunChild::DeleteText(_) | RunChild::PTab(_) | RunChild::Drawing(_) | RunChild::Shape(_) | RunChild::CommentStart(_) | RunChild::CommentEnd(_) | RunChild::FieldChar(_) | RunChild::InstrText(_) | RunChild::DeleteInstrText(_) | RunChild::FootnoteReference(_) | RunChild::Shading(_) => "".to_string(), }) .collect::<String>() } /// Extract visible text from a DOCX file. /// /// The extracted output is paragraph-oriented and joined with newline separators. pub fn extract_text_from_path<P>(path: P) -> ApiResult<String> where P: Into<PathBuf>, { let path = path.into(); let bytes = read(path.clone()).map_err(|why| eyre!("Failed to read DOCX file {} - {why}", path.display()))?; let docx = read_docx(&bytes).map_err(|why| eyre!("Failed to parse DOCX file {} - {why}", path.display()))?; let text = docx .document .children .iter() .filter_map(|child| match child { | DocumentChild::Paragraph(p) => { let text = extract_from_paragraph(p).trim().to_string(); if text.is_empty() { None } else { Some(text) } } | DocumentChild::Table(t) => { let text = extract_from_table(t).trim().to_string(); if text.is_empty() { None } else { Some(text) } } | _ => None, }) .collect::<Vec<_>>() .join("\n"); Ok(text) } #[cfg(test)] mod tests; Loading
Cargo.lock +37 −0 Original line number Diff line number Diff line Loading @@ -74,6 +74,7 @@ dependencies = [ "data-encoding", "derive_more", "directories", "docx-rs", "dotenvy", "duckdb", "exitcode", Loading Loading @@ -1954,6 +1955,20 @@ dependencies = [ "litrs", ] [[package]] name = "docx-rs" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed73cbf5e1c37baa23f4132569ac1187829f03922c206bd68fe109e3001a343d" dependencies = [ "base64 0.22.1", "quick-xml 0.36.2", "serde", "serde_json", "thiserror 2.0.18", "zip 0.6.6", ] [[package]] name = "dotenvy" version = "0.15.7" Loading Loading @@ -5343,6 +5358,16 @@ dependencies = [ "memchr", ] [[package]] name = "quick-xml" version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" dependencies = [ "encoding_rs", "memchr", ] [[package]] name = "quick-xml" version = "0.39.1" Loading Loading @@ -9223,6 +9248,18 @@ dependencies = [ "time 0.1.45", ] [[package]] name = "zip" version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ "byteorder", "crc32fast", "crossbeam-utils", "flate2", ] [[package]] name = "zip" version = "6.0.0" Loading
Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ derive_more = { version = "2.1.1", features = [ "from_str", ], default-features = false } dotenvy = { version = "0.15.0", default-features = false } docx-rs = { version = "0.4.17", default-features = false } duckdb = { version = "1.10502.0", default-features = false } fancy-regex = { version = "0.17.0", default-features = false } futures = { version = "0.3.32", features = ["alloc"], default-features = false } Loading
acorn-cli/src/commands/export/powerpoint.rs +2 −2 Original line number Diff line number Diff line Loading @@ -3,9 +3,9 @@ //! This module generates PPTX output by copying slide templates, updating //! OOXML relationships, and interpolating Research Activity data. use crate::cli::CommandOptions; use acorn::io::powerpoint::ooxml::{Relationship, Relationships}; use acorn::io::powerpoint::{interpolate_values, read_xml_rel}; use acorn::io::{archive, extract_zip, read_file, write_file, ApiResult, InputOutput}; use acorn::powerpoint::ooxml::{Relationship, Relationships}; use acorn::powerpoint::{interpolate_values, read_xml_rel}; use acorn::prelude::{copy, create_dir_all, exit, Path, PathBuf}; use acorn::schema::research_activity::ResearchActivity; use acorn::util::{Label, StringConversion}; Loading
acorn-lib/Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,7 @@ data-encoding = { workspace = true } derive_more = { workspace = true } directories = { version = "6.0.0", default-features = false } dotenvy = { workspace = true } docx-rs = { workspace = true } duckdb = { workspace = true, features = ["bundled"], default-features = false, optional = true } fancy-regex = { workspace = true } flate2 = { version = "1.1.5", default-features = false } Loading
acorn-lib/src/io/docx/mod.rs 0 → 100644 +159 −0 Original line number Diff line number Diff line //! DOCX input utilities. use crate::io::ApiResult; use crate::prelude::{read, PathBuf}; use color_eyre::eyre::eyre; use docx_rs::{ read_docx, Delete, DeleteChild, DocumentChild, Hyperlink, Insert, InsertChild, Paragraph, ParagraphChild, Run, RunChild, Table, TableCell, TableCellContent, TableChild, TableRow, TableRowChild, }; fn extract_from_table(table: &Table) -> String { table .rows .iter() .map(|row| match row { | TableChild::TableRow(value) => extract_from_table_row(value), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_table_row(row: &TableRow) -> String { row.cells .iter() .map(|cell| match cell { | TableRowChild::TableCell(value) => extract_from_table_cell(value), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_table_cell(cell: &TableCell) -> String { cell.children .iter() .map(|content| match content { | TableCellContent::Paragraph(paragraph) => extract_from_paragraph(paragraph), | TableCellContent::Table(table) => extract_from_table(table), | TableCellContent::StructuredDataTag(_) | TableCellContent::TableOfContents(_) => String::new(), }) .collect::<Vec<_>>() .join("\n") } fn extract_from_paragraph(paragraph: &Paragraph) -> String { paragraph .children .iter() .map(|child| match child { | ParagraphChild::Run(run) => extract_from_run(run), | ParagraphChild::Hyperlink(hyperlink) => extract_from_hyperlink(hyperlink), | ParagraphChild::Insert(insert) => extract_from_insert(insert), | ParagraphChild::Delete(delete) => extract_from_delete(delete), | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => " ".to_string(), | ParagraphChild::BookmarkStart(_) | ParagraphChild::BookmarkEnd(_) | ParagraphChild::CommentStart(_) | ParagraphChild::CommentEnd(_) | ParagraphChild::StructuredDataTag(_) => "".to_string(), }) .collect::<String>() } fn extract_from_hyperlink(hyperlink: &Hyperlink) -> String { hyperlink .children .iter() .map(|child| match child { | ParagraphChild::Run(run) => extract_from_run(run), | ParagraphChild::Insert(insert) => extract_from_insert(insert), | ParagraphChild::Delete(delete) => extract_from_delete(delete), | ParagraphChild::Hyperlink(_) => "".to_string(), | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => " ".to_string(), | ParagraphChild::BookmarkStart(_) | ParagraphChild::BookmarkEnd(_) | ParagraphChild::CommentStart(_) | ParagraphChild::CommentEnd(_) | ParagraphChild::StructuredDataTag(_) => "".to_string(), }) .collect::<String>() } fn extract_from_insert(insert: &Insert) -> String { insert .children .iter() .map(|child| match child { | InsertChild::Run(run) => extract_from_run(run), | InsertChild::Delete(delete) => extract_from_delete(delete), | InsertChild::CommentStart(_) | InsertChild::CommentEnd(_) => "".to_string(), }) .collect::<String>() } fn extract_from_delete(delete: &Delete) -> String { delete .children .iter() .map(|child| match child { | DeleteChild::Run(run) => extract_from_run(run), | DeleteChild::CommentStart(_) | DeleteChild::CommentEnd(_) => "".to_string(), }) .collect::<String>() } fn extract_from_run(run: &Run) -> String { run.children .iter() .map(|child| match child { | RunChild::Text(text) => text.text.clone(), | RunChild::InstrTextString(text) => text.clone(), | RunChild::Tab(_) => "\t".to_string(), | RunChild::Break(_) => "\n".to_string(), | RunChild::Sym(_) | RunChild::DeleteText(_) | RunChild::PTab(_) | RunChild::Drawing(_) | RunChild::Shape(_) | RunChild::CommentStart(_) | RunChild::CommentEnd(_) | RunChild::FieldChar(_) | RunChild::InstrText(_) | RunChild::DeleteInstrText(_) | RunChild::FootnoteReference(_) | RunChild::Shading(_) => "".to_string(), }) .collect::<String>() } /// Extract visible text from a DOCX file. /// /// The extracted output is paragraph-oriented and joined with newline separators. pub fn extract_text_from_path<P>(path: P) -> ApiResult<String> where P: Into<PathBuf>, { let path = path.into(); let bytes = read(path.clone()).map_err(|why| eyre!("Failed to read DOCX file {} - {why}", path.display()))?; let docx = read_docx(&bytes).map_err(|why| eyre!("Failed to parse DOCX file {} - {why}", path.display()))?; let text = docx .document .children .iter() .filter_map(|child| match child { | DocumentChild::Paragraph(p) => { let text = extract_from_paragraph(p).trim().to_string(); if text.is_empty() { None } else { Some(text) } } | DocumentChild::Table(t) => { let text = extract_from_table(t).trim().to_string(); if text.is_empty() { None } else { Some(text) } } | _ => None, }) .collect::<Vec<_>>() .join("\n"); Ok(text) } #[cfg(test)] mod tests;