Commit 3ccc7979 authored by Wohlgemuth, Jason
Browse files

feat: Refactor powerpoint to be under io and create initial docx module

parent 98481626
Loading
Loading
Loading
Loading
Loading
+37 −0
Original line number Diff line number Diff line
@@ -74,6 +74,7 @@ dependencies = [
 "data-encoding",
 "derive_more",
 "directories",
 "docx-rs",
 "dotenvy",
 "duckdb",
 "exitcode",
@@ -1954,6 +1955,20 @@ dependencies = [
 "litrs",
]

[[package]]
name = "docx-rs"
version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed73cbf5e1c37baa23f4132569ac1187829f03922c206bd68fe109e3001a343d"
dependencies = [
 "base64 0.22.1",
 "quick-xml 0.36.2",
 "serde",
 "serde_json",
 "thiserror 2.0.18",
 "zip 0.6.6",
]

[[package]]
name = "dotenvy"
version = "0.15.7"
@@ -5343,6 +5358,16 @@ dependencies = [
 "memchr",
]

[[package]]
name = "quick-xml"
version = "0.36.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe"
dependencies = [
 "encoding_rs",
 "memchr",
]

[[package]]
name = "quick-xml"
version = "0.39.1"
@@ -9223,6 +9248,18 @@ dependencies = [
 "time 0.1.45",
]

[[package]]
name = "zip"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
dependencies = [
 "byteorder",
 "crc32fast",
 "crossbeam-utils",
 "flate2",
]

[[package]]
name = "zip"
version = "6.0.0"
+1 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ derive_more = { version = "2.1.1", features = [
  "from_str",
], default-features = false }
dotenvy = { version = "0.15.0", default-features = false }
docx-rs = { version = "0.4.17", default-features = false }
duckdb = { version = "1.10502.0", default-features = false }
fancy-regex = { version = "0.17.0", default-features = false }
futures = { version = "0.3.32", features = ["alloc"], default-features = false }
+2 −2
Original line number Diff line number Diff line
@@ -3,9 +3,9 @@
//! This module generates PPTX output by copying slide templates, updating
//! OOXML relationships, and interpolating Research Activity data.
use crate::cli::CommandOptions;
use acorn::io::powerpoint::ooxml::{Relationship, Relationships};
use acorn::io::powerpoint::{interpolate_values, read_xml_rel};
use acorn::io::{archive, extract_zip, read_file, write_file, ApiResult, InputOutput};
use acorn::powerpoint::ooxml::{Relationship, Relationships};
use acorn::powerpoint::{interpolate_values, read_xml_rel};
use acorn::prelude::{copy, create_dir_all, exit, Path, PathBuf};
use acorn::schema::research_activity::ResearchActivity;
use acorn::util::{Label, StringConversion};
+1 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ data-encoding = { workspace = true }
derive_more = { workspace = true }
directories = { version = "6.0.0", default-features = false }
dotenvy = { workspace = true }
docx-rs = { workspace = true }
duckdb = { workspace = true, features = ["bundled"], default-features = false, optional = true }
fancy-regex = { workspace = true }
flate2 = { version = "1.1.5", default-features = false }
+159 −0
Original line number Diff line number Diff line
//! DOCX input utilities.
use crate::io::ApiResult;
use crate::prelude::{read, PathBuf};
use color_eyre::eyre::eyre;
use docx_rs::{
    read_docx, Delete, DeleteChild, DocumentChild, Hyperlink, Insert, InsertChild, Paragraph, ParagraphChild, Run, RunChild, Table, TableCell,
    TableCellContent, TableChild, TableRow, TableRowChild,
};

/// Render a table as text: each row is extracted in order and rows are
/// separated by a single newline.
fn extract_from_table(table: &Table) -> String {
    let mut rendered = String::new();
    for (index, child) in table.rows.iter().enumerate() {
        // The separator goes between rows only, exactly like a newline join.
        if index > 0 {
            rendered.push('\n');
        }
        let TableChild::TableRow(row) = child;
        rendered.push_str(&extract_from_table_row(row));
    }
    rendered
}
/// Render a table row as text: each cell is extracted in order and cells are
/// separated by a single newline.
fn extract_from_table_row(row: &TableRow) -> String {
    let mut cells: Vec<String> = Vec::new();
    for child in &row.cells {
        let TableRowChild::TableCell(cell) = child;
        cells.push(extract_from_table_cell(cell));
    }
    cells.join("\n")
}
/// Render a table cell as text.
///
/// Paragraphs and nested tables are extracted recursively; structured data
/// tags and tables of contents contribute an empty entry. Entries are joined
/// with newlines.
fn extract_from_table_cell(cell: &TableCell) -> String {
    let mut parts: Vec<String> = Vec::new();
    for content in &cell.children {
        let piece = match content {
            | TableCellContent::Paragraph(paragraph) => extract_from_paragraph(paragraph),
            | TableCellContent::Table(table) => extract_from_table(table),
            // Skipped content still occupies a slot, so the join emits a separator for it.
            | TableCellContent::StructuredDataTag(_) | TableCellContent::TableOfContents(_) => String::new(),
        };
        parts.push(piece);
    }
    parts.join("\n")
}
/// Concatenate the text of every child of a paragraph, with no separator.
///
/// Runs, hyperlinks, insertions and deletions are extracted recursively.
/// Page-number and page-count fields become a single space. Bookmarks,
/// comment markers and structured data tags contribute nothing.
fn extract_from_paragraph(paragraph: &Paragraph) -> String {
    let mut text = String::new();
    for child in &paragraph.children {
        match child {
            | ParagraphChild::Run(run) => text.push_str(&extract_from_run(run)),
            | ParagraphChild::Hyperlink(hyperlink) => text.push_str(&extract_from_hyperlink(hyperlink)),
            | ParagraphChild::Insert(insert) => text.push_str(&extract_from_insert(insert)),
            | ParagraphChild::Delete(delete) => text.push_str(&extract_from_delete(delete)),
            | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => text.push(' '),
            | ParagraphChild::BookmarkStart(_)
            | ParagraphChild::BookmarkEnd(_)
            | ParagraphChild::CommentStart(_)
            | ParagraphChild::CommentEnd(_)
            | ParagraphChild::StructuredDataTag(_) => {}
        }
    }
    text
}
/// Concatenate the text of every child of a hyperlink, with no separator.
///
/// Handles the same child kinds as a paragraph, except that a hyperlink
/// nested inside this one is ignored rather than recursed into.
fn extract_from_hyperlink(hyperlink: &Hyperlink) -> String {
    let mut text = String::new();
    for child in &hyperlink.children {
        match child {
            | ParagraphChild::Run(run) => text.push_str(&extract_from_run(run)),
            | ParagraphChild::Insert(insert) => text.push_str(&extract_from_insert(insert)),
            | ParagraphChild::Delete(delete) => text.push_str(&extract_from_delete(delete)),
            | ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => text.push(' '),
            // Nested hyperlinks and non-textual markers add nothing to the output.
            | ParagraphChild::Hyperlink(_)
            | ParagraphChild::BookmarkStart(_)
            | ParagraphChild::BookmarkEnd(_)
            | ParagraphChild::CommentStart(_)
            | ParagraphChild::CommentEnd(_)
            | ParagraphChild::StructuredDataTag(_) => {}
        }
    }
    text
}
/// Concatenate the text of an insertion's children.
///
/// Runs and nested deletions are extracted; comment markers contribute nothing.
fn extract_from_insert(insert: &Insert) -> String {
    let mut text = String::new();
    for child in &insert.children {
        match child {
            | InsertChild::Run(run) => text.push_str(&extract_from_run(run)),
            | InsertChild::Delete(delete) => text.push_str(&extract_from_delete(delete)),
            | InsertChild::CommentStart(_) | InsertChild::CommentEnd(_) => {}
        }
    }
    text
}
/// Concatenate the text of a deletion's children.
///
/// Runs are passed to `extract_from_run`; comment markers contribute nothing.
fn extract_from_delete(delete: &Delete) -> String {
    let mut text = String::new();
    for child in &delete.children {
        match child {
            | DeleteChild::Run(run) => text.push_str(&extract_from_run(run)),
            | DeleteChild::CommentStart(_) | DeleteChild::CommentEnd(_) => {}
        }
    }
    text
}
/// Concatenate the textual content of a run.
///
/// Text and instruction-text strings are copied verbatim, tabs become `\t`
/// and breaks become `\n`. Every other child kind (symbols, deleted text,
/// drawings, shapes, field characters, footnote references, ...) is skipped.
// NOTE(review): `InstrTextString` is emitted as-is; if it carries field
// instruction codes rather than displayed text, confirm that is intended.
fn extract_from_run(run: &Run) -> String {
    let mut text = String::new();
    for child in &run.children {
        match child {
            | RunChild::Text(value) => text.push_str(&value.text),
            | RunChild::InstrTextString(value) => text.push_str(value),
            | RunChild::Tab(_) => text.push('\t'),
            | RunChild::Break(_) => text.push('\n'),
            | RunChild::Sym(_)
            | RunChild::DeleteText(_)
            | RunChild::PTab(_)
            | RunChild::Drawing(_)
            | RunChild::Shape(_)
            | RunChild::CommentStart(_)
            | RunChild::CommentEnd(_)
            | RunChild::FieldChar(_)
            | RunChild::InstrText(_)
            | RunChild::DeleteInstrText(_)
            | RunChild::FootnoteReference(_)
            | RunChild::Shading(_) => {}
        }
    }
    text
}
/// Read a DOCX file and return its text content.
///
/// Each top-level paragraph or table of the document body is extracted and
/// trimmed; blocks that end up empty are dropped, and the remaining blocks
/// are joined with newlines. Other kinds of top-level content are ignored.
///
/// # Errors
///
/// Returns an error when the file cannot be read or when its contents
/// cannot be parsed as a DOCX document.
pub fn extract_text_from_path<P>(path: P) -> ApiResult<String>
where
    P: Into<PathBuf>,
{
    let path = path.into();
    let bytes = read(path.clone()).map_err(|why| eyre!("Failed to read DOCX file {} - {why}", path.display()))?;
    let docx = read_docx(&bytes).map_err(|why| eyre!("Failed to parse DOCX file {} - {why}", path.display()))?;
    let mut blocks: Vec<String> = Vec::new();
    for child in &docx.document.children {
        let raw = match child {
            | DocumentChild::Paragraph(p) => extract_from_paragraph(p),
            | DocumentChild::Table(t) => extract_from_table(t),
            | _ => continue,
        };
        // Whitespace-only blocks are discarded so they do not yield blank lines.
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            blocks.push(trimmed.to_string());
        }
    }
    Ok(blocks.join("\n"))
}

#[cfg(test)]
mod tests;
Loading