diff options
Diffstat (limited to 'extract/src/docx.c')
-rw-r--r-- | extract/src/docx.c | 167 |
1 files changed, 85 insertions, 82 deletions
diff --git a/extract/src/docx.c b/extract/src/docx.c index 761de176..ca6c5d78 100644 --- a/extract/src/docx.c +++ b/extract/src/docx.c @@ -95,7 +95,7 @@ static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* con content_state.font.size = 10; content_state.font.bold = 0; content_state.font.italic = 0; - + if (s_docx_run_start(alloc, content, &content_state)) goto end; //docx_char_append_string(content, "&#160;"); /* &#160; is non-break space. */ if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end; @@ -168,9 +168,9 @@ font. */ if (s_docx_run_finish(alloc, content_state, content)) goto end; } if (s_docx_paragraph_finish(alloc, content)) goto end; - + e = 0; - + end: return e; } @@ -245,7 +245,7 @@ static int s_docx_append_image( static int s_docx_output_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, int paragraph_begin, int paragraph_end, int rot, @@ -330,7 +330,7 @@ static int s_docx_output_rotated_paragraphs( /* Output paragraphs p0..p2-1. */ for (p=paragraph_begin; p<paragraph_end; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + paragraph_t* paragraph = subpage->paragraphs[p]; if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } @@ -364,7 +364,7 @@ static int s_docx_output_rotated_paragraphs( extract_astring_cat(alloc, content, " <w:txbxContent>"); for (p=paragraph_begin; p<paragraph_end; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + paragraph_t* paragraph = subpage->paragraphs[p]; if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } @@ -392,7 +392,7 @@ to the application. */ { int e = -1; int y; - + if (extract_astring_cat(alloc, content, "\n" " <w:tbl>\n" @@ -406,14 +406,14 @@ to the application. 
*/ " <w:tr>\n" " <w:trPr/>\n" )) goto end; - + for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[y*table->cells_num_x + x]; if (!cell->left) continue; - + if (extract_astring_cat(alloc, content, " <w:tc>\n")) goto end; - + /* Write cell properties. */ { if (extract_astring_cat(alloc, content, @@ -442,7 +442,7 @@ to the application. */ } if (extract_astring_cat(alloc, content, " </w:tcPr>\n")) goto end; } - + /* Write contents of this cell. */ { size_t chars_num_old = content->chars_num; @@ -476,20 +476,20 @@ to the application. */ } if (extract_astring_cat(alloc, content, " </w:tbl>\n")) goto end; e = 0; - + end: return e; } static int s_docx_append_rotated_paragraphs( extract_alloc_t* alloc, - extract_page_t* page, + subpage_t* subpage, content_state_t* state, int* p, int* text_box_id, const matrix_t* ctm, double rotate, - extract_astring_t* content + extract_astring_t* output ) /* Appends paragraphs with same rotation, starting with page->paragraphs[*p] and updates *p. */ @@ -501,8 +501,8 @@ and updates *p. */ point_t extent = {0, 0}; int p0 = *p; int p1; - paragraph_t* paragraph = page->paragraphs[*p]; - + paragraph_t* paragraph = subpage->paragraphs[*p]; + outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", rotate, rotate * 180 / pi, ctm->e, @@ -535,8 +535,8 @@ and updates *p. */ ctm->a, ctm->b, ctm->c, ctm->d); } - for (*p=p0; *p<page->paragraphs_num; ++(*p)) { - paragraph = page->paragraphs[*p]; + for (*p=p0; *p<subpage->paragraphs_num; ++(*p)) { + paragraph = subpage->paragraphs[*p]; ctm = &paragraph->lines[0]->spans[0]->ctm; rotate = atan2(ctm->b, ctm->a); if (rotate != rotate0) { @@ -625,13 +625,13 @@ and updates *p. 
*/ x -= dx; y -= -dy; - if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end; + if (s_docx_output_rotated_paragraphs(alloc, subpage, p0, p1, rot, x, y, w, h, *text_box_id, output, state)) goto end; } *p = p1 - 1; e = 0; - + end: - + return e; } @@ -647,38 +647,40 @@ int extract_document_to_docx_content( int ret = -1; int text_box_id = 0; int p; - + /* Write paragraphs into <content>. */ for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - - int p = 0; - int t = 0; - - content_state_t content_state; - content_state.font.name = NULL; - content_state.font.size = 0; - content_state.font.bold = 0; - content_state.font.italic = 0; - content_state.ctm_prev = NULL; - - /* Output paragraphs and tables in order of y coordinate. */ - for(;;) - { - paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p]; - table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; - double y_paragraph; - double y_table; - if (!paragraph && !table) break; - y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; - y_table = (table) ? table->pos.y : DBL_MAX; - - if (paragraph && y_paragraph < y_table) - { - const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; - double rotate = atan2(ctm->b, ctm->a); - - if (spacing + int c; + + for (c=0; c<page->subpages_num; ++c) { + subpage_t* subpage = page->subpages[c]; + + int p = 0; + int t = 0; + + content_state_t content_state; + content_state.font.name = NULL; + content_state.font.size = 0; + content_state.font.bold = 0; + content_state.font.italic = 0; + content_state.ctm_prev = NULL; + + /* Output paragraphs and tables in order of y coordinate. */ + for(;;) { + paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : subpage->paragraphs[p]; + table_t* table = (t == subpage->tables_num) ? 
NULL : subpage->tables[t]; + double y_paragraph; + double y_table; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? table->pos.y : DBL_MAX; + + if (paragraph && y_paragraph < y_table) { + const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; + double rotate = atan2(ctm->b, ctm->a); + + if (spacing && content_state.ctm_prev && paragraph->lines_num && paragraph->lines[0]->spans_num @@ -687,37 +689,38 @@ int extract_document_to_docx_content( &paragraph->lines[0]->spans[0]->ctm ) ) { - /* Extra vertical space between paragraphs that were at - different angles in the original document. */ - if (s_docx_paragraph_empty(alloc, content)) goto end; - } + /* Extra vertical space between paragraphs that were at + different angles in the original document. */ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - if (spacing) { - /* Extra vertical space between paragraphs. 
*/ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - if (rotation && rotate != 0) - { - if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end; + if (rotation && rotate != 0) + { + if (s_docx_append_rotated_paragraphs(alloc, subpage, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end; + } + else + { + if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; + } + p += 1; } - else + else if (table) { - if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; + if (s_docx_append_table(alloc, table, content)) goto end; + t += 1; } - p += 1; - } - else if (table) - { - if (s_docx_append_table(alloc, table, content)) goto end; - t += 1; } - } - - if (images) { - int i; - for (i=0; i<page->images_num; ++i) { - s_docx_append_image(alloc, content, &page->images[i]); + + if (images) { + int i; + for (i=0; i<subpage->images_num; ++i) { + s_docx_append_image(alloc, content, &subpage->images[i]); + } } } } @@ -759,7 +762,7 @@ int extract_docx_content_item( extract_astring_t temp; extract_astring_init(&temp); *text2 = NULL; - + if (0) {} else if (!strcmp(name, "[Content_Types].xml")) { @@ -841,7 +844,7 @@ int extract_docx_content_item( return e; } - + int extract_docx_write_template( extract_alloc_t* alloc, @@ -862,7 +865,7 @@ int extract_docx_write_template( assert(path_out); assert(path_template); - + if (extract_check_path_shell_safe(path_out)) { outf("path_out is unsafe: %s", path_out); goto end; @@ -889,7 +892,7 @@ int extract_docx_write_template( /* Might be nice to iterate through all items in path_tempdir, but for now we look at just the items that we know extract_docx_content_item() will modify. 
*/ - + { const char* names[] = { "word/document.xml", @@ -904,7 +907,7 @@ int extract_docx_write_template( extract_free(alloc, &text2); if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; if (extract_read_all_path(alloc, path, &text)) goto end; - + if (extract_docx_content_item( alloc, contentss, @@ -926,14 +929,14 @@ int extract_docx_write_template( extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; if (extract_mkdir(path, 0777)) goto end; - + for (i=0; i<images->images_num; ++i) { image_t* image = &images->images[i]; extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; if (extract_write_all(image->data, image->data_size, path)) goto end; } - + outf("Zipping tempdir to create %s", path_out); { const char* path_out_leaf = strrchr(path_out, '/'); |