aspose通过开始和结束位置关键词截取word另存为新文件

2024-01-02 13:32:17

关键词匹配实体类:

/**
 * Keyword-matching settings used to locate a span of text.
 *
 * <p>Every property carries its keywords as one string; according to the
 * field contracts, multiple keywords are separated by commas.
 */
@Data
@EqualsAndHashCode(callSuper = false)
public class TextConfig implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Start keyword(s); multiple values are comma-separated. */
    private String textStart;

    /** End keyword(s); multiple values are comma-separated. */
    private String textEnd;

    /** Keyword(s) that must be present; multiple values are comma-separated. */
    private String textInclude;

    /** Keyword(s) that must be absent; multiple values are comma-separated. */
    private String textExclude;

}
import com.aspose.words.*;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.*;


@EqualsAndHashCode(callSuper = false)
@Slf4j
@Data
public class WordResolve extends Document {
    /**
     * 查找文本类型另存为word
     * @param filePathName  文件保存路径
     * @param copyFirst     是否复制关键词开始节点
     * @param copyLast      是否复制关键词结束节点
     */
    @SneakyThrows
    public File findBetweenFile(TextConfig textConfig, String filePathName, boolean copyFirst, boolean copyLast) {
        List<Paragraph> paragraphs = getAllParagraph();
        Integer[] sec = findBetweenIndex(textConfig,getAllText());
        if(sec == null){
            return null;
        }
        Document doc = new Document();
        Body body = doc.getFirstSection().getBody();
        body.removeAllChildren();
        NodeImporter importer = new NodeImporter(this, doc, ImportFormatMode.KEEP_SOURCE_FORMATTING);
        Paragraph first = paragraphs.get(sec[0]);
        Paragraph last = paragraphs.get(sec[1]);

        List<CompositeNode<?>> parentNodes = new ArrayList<>(20);

        boolean startCopying = false;
        //读取文档的所有节点
        NodeCollection<?> allNodeList = this.getChildNodes(NodeType.ANY, true);
        for (int i = 0, j = allNodeList.getCount(); i < j; i++) {
            Node node = allNodeList.get(i);
            try{
                if (node == first) {
                    startCopying = true;
                    if(!copyFirst){
                        continue;
                    }
                }
                if (node == last && !copyLast) {
                    // 到达结束节点后停止复制
                    break;
                }
                if (startCopying) {
                    boolean append = true;
                    for (CompositeNode<?> parentNode : parentNodes) {
                        NodeCollection<?> childNodes = parentNode.getChildNodes(node.getNodeType(), true);
                        if(childNodes.contains(node)){
                            append = false;
                            break;
                        }
                    }
                    if(append){
                        try{
                            body.appendChild(importer.importNode(node, true));
                        }catch (Exception e){
                            log.error("插入节点出错:{}",e.getMessage());
                            //ignore
                        }
                    }
                }
                if (node == last) {
                    // 到达结束节点后停止复制
                    break;
                }
            }finally {
                if(startCopying && node.isComposite()){
                    CompositeNode<?> compositeNode = (CompositeNode<?>) node;
                    if(!parentNodes.contains(compositeNode)){
                        parentNodes.add(compositeNode);
                    }
                }
            }
        }
        File file = FileUtils.getFile(filePathName);
        doc.save(filePathName);
        return file;
    }

    /**
     * 查找文本类型解析规则的开始结束段落索引
     * @param   strings word全部段落,每个段落的文本
     */
    public Integer[] findBetweenIndex(TextConfig textConfig, List<String> strings){
        String textStart = textConfig.getTextStart();
        String textEnd = textConfig.getTextEnd();
        //规定开始关键词必须包含哪些文字,多个逗号分隔(作为附加判断,可为空)
        textInclude = StringUtil.defaultString(textConfig.getTextInclude(), "").replace(",", "");
        //规定结束关键词必须不包含哪些文字,多个逗号分隔(作为附加判断,可为空)
        textExclude = StringUtil.defaultString(textConfig.getTextExclude(), "").replace(",", "");
        String[] in = StringUtil.isBlank(textInclude) ? null : textInclude.split(",");
        String[] out = StringUtil.isBlank(textExclude) ? null : textExclude.split(",");
        //满足开始位置和结束位置的全部关键词索引
        List<Integer> startArr = new ArrayList<>();
        List<Integer> endArr = new ArrayList<>();

        for (int i = 0; i < strings.size(); i++) {
            String text = strings.get(i);
            if (text.contains(textStart)) {
                startArr.add(i);
            }
            if (text.contains(textEnd)) {
                endArr.add(i);
            }
        }
        //进行包含和非包含的判断过滤
        if(!startArr.isEmpty() && !endArr.isEmpty()){
            for (Integer start : startArr) {
                for (Integer end : endArr) {
                    //中间至少隔了一个段落
                    if(start + 1 < end){
                        StringJoiner jo = new StringJoiner("\n");
                        for (int i = start + 1; i < end; i++) {
                            jo.add(strings.get(i));
                        }
                        String word = jo.toString();
                        boolean match = true;
                        if(in != null){
                            for (String s : in) {
                                if(!word.contains(s)){
                                    match = false;
                                    break;
                                }
                            }
                        }
                        if(match && out != null){
                            for (String s : out) {
                                if(word.contains(s)){
                                    match = false;
                                    break;
                                }
                            }
                        }
                        if(match){
                            return new Integer[]{start,end};
                        }
                    }
                }
            }
        }
        return null;
    }

    /**
     * 拿到文档全部段落文本
     */
    public List<String> getAllText(){
        List<String> strings = new ArrayList<>();
        getAllParagraph().forEach(a-> strings.add(StringTool.safeToString(a.getText(), "")));
        return strings;
    }

    /**
     * 拿到文档全部段落
     */
    public List<Paragraph> getAllParagraph(){
        if(allParagraph == null){
            allParagraph = findNodeByType(NodeType.PARAGRAPH, Paragraph.class);
        }
        return allParagraph;
    }
}

使用方式:

word文档内容如下:

要截取标题三和标题四之间的内容,生成新的word,代码如下:

/**
 * Usage example: extracts the content between two heading keywords of a Word file
 * and saves it as a new document.
 */
public class Test {

    /**
     * Loads the sample document, searches for the start/end keywords and prints the
     * resulting file.
     */
    public static void main(String[] args) throws Exception {
        // License validation is switched off in this sample:
        //new AsposeLicense().validate();

        String sourcePath = "C:\\Users\\zhou\\Desktop\\测试.docx";
        String targetPath = "C:\\Users\\zhou\\Desktop\\betweenFile.docx";

        // Keywords marking where the extracted range begins and ends.
        TextConfig keywords = new TextConfig();
        keywords.setTextStart("poi导出大数据量问题、写入速度慢");
        keywords.setTextEnd("国密验签失败");

        // Read the Word file and cut out the range, excluding both keyword paragraphs.
        WordResolve source = new WordResolve(new File(sourcePath));
        File extracted = source.findBetweenFile(keywords, targetPath, false, false);
        System.out.println(extracted);
    }

}

截取保存的文件如下:

文章来源:https://blog.csdn.net/qq_36635569/article/details/135337061
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。