pdf.js实现多个不同词的高亮显示

pdf.js

PDF Reader in JavaScript

项目地址：https://gitcode.com/gh_mirrors/pd/pdf.js

免费下载资源

NEO_L

9875人浏览 · 2018-07-30 16:23:32

NEO_L · 2018-07-30 16:23:32 发布

这里我是在没有压缩过的版本上进行修改的，该版本的目录分析参考：https://blog.csdn.net/a973685825/article/details/81285288

目标：实现多个词的高亮显示

原理：通过对pdf.js本身提供的搜索功能（即ctrl+f）进行修改来达到多个词的高亮显示

注意：经过实践证明，我所采用的这种多词高亮方法只能在文档加载时进行一次性高亮。由于我的方法是在搜索功能上进行修改的，故有很多限制；理论上可以自己写一个扩展类来实现这个功能，但难度比较大，我是个小白写不出来。

web目录下是很多扩展功能的js文件，我们主要是修改里面的js文件：

viewer.js是入口文件，即整个pdf文档加载的入口js文件

viewer.html 即展示页面

app_options.js 即整个的参数配置文件

app.js 即整个程序所有函数的执行处,类似于java中的main函数处

其他的js文件主要就是pdf.js所提供的一些扩展功能，如搜索功能，下载功能等等，而我需要修改的就是这些文件的其中一小部分

这里我主要关注三个文件：app.js、pdf_find_controller.js、text_layer_builder.js

实现过程概述：pdf.js将文本层转为html后，所有的文本都被div包裹起来，这些div在页面中都是有序的。它首先将这些文本从div中提取出来整合成一个字符串，通过字符串匹配得到关键词的位置，并将这些位置放在一个数组中；然后通过text_layer_builder.js中的convertMatches函数，将这些位置转化成相对于div的位置（比如：这个关键词的起始位置在第几个div、偏移多少等），这样就可以根据转化后的位置添加span标签和类名，对关键词背景进行高亮。

原本pdf.js只是计算了单个词位置，而我可以通过循环计算多个不同词的位置，存到数组里，然后转化，就可以对多个不同的词进行高亮

注意：我这里用的是未压缩版的pdf.js，实际运行特别慢，所以你需要在压缩版的viewer.js（即官网提供的demo）中找到对应的函数进行修改。另外最好把pdf.js提供的搜索功能屏蔽掉，通过display=none即可，因为多个词的高亮和该搜索功能不能同时使用，除非你自己单独写一个扩展类

1. 首先：在app.js中，创建一个函数，如：

/**
 * Highlights every occurrence of the given words by driving the viewer's
 * find controller programmatically (the same command the find bar issues).
 *
 * The query handed to the controller is always a fresh array, so the
 * controller is free to normalize or lower-case its entries without touching
 * the caller's data. Entries that are not strings, or that contain only
 * whitespace, are dropped because they can never match anything.
 *
 * @param {string|string[]} hightLightWords - A single word or a list of words
 *   to highlight. `null`/`undefined` is treated as an empty list.
 * @returns {void}
 */
function wordHighLight(hightLightWords) {
  let candidates = [];
  if (typeof hightLightWords === 'string') {
    candidates = [hightLightWords];
  } else if (Array.isArray(hightLightWords)) {
    candidates = hightLightWords;
  }

  // `filter` returns a new array, which doubles as the defensive copy.
  const query = candidates.filter(
    (word) => typeof word === 'string' && word.trim() !== ''
  );

  PDFViewerApplication.findController.executeCommand('find', {
    query,
    // Multi-word highlighting relies on the per-word matcher, so phrase
    // search must stay disabled.
    phraseSearch: false,
    // Ignore letter case while searching.
    caseSensitive: false,
    // Highlight all matches rather than only the current one.
    highlightAll: true,
    findPrevious: false,
  });
}

2. wordHighLight函数的调用

找到：PDFFindBar类（即搜索功能类）实例化处，在下面调用wordHighLight

     // Instantiate the PDFFindBar (the search feature's UI class).
     this.findBar = new PDFFindBar(findBarConfig, this.l10n);
     // A keydown in the find bar would normally trigger a search; the same
     // search can be triggered programmatically right here instead.
     const highLightWords = [
       '云林街菜鸟物流园职工，非人大代表或政协委员，租住雄楚市雄楚区金港一号小区11栋504室',
       '李毅',
       '张昌',
       '聊城',
       '犯罪经历',
       '犯罪嫌疑人张昌',
     ];
     wordHighLight(highLightWords);

3. 因为executeCommand函数是在pdf_find_controller.js中执行，

在pdf_find_controller.js中，找到executeCommand(cmd, state)——》_nextMatch()——》_calculateMatch(pageIndex)函数，由于query以前是单个词搜索（pdf.js提供的功能），这里我需要对涉及到query的地方做一些循环处理，如下（我只是做了一些循环处理，请根据具体代码进行修改）：

  _calculateMatch(pageIndex) {  // 计算结果都返回到了对象的属性里，所以不需要通过
    // return 来获取想要的值
    // _normalize应该是规范化的意思
    let pageContent = this._normalize(this.pageContents[pageIndex]);
    let query_words = this.state.query;
    for (let i = 0; i < query_words.length; i++) {
      query_words[i] = this._normalize(query_words[i]);
      let caseSensitive = this.state.caseSensitive;

      if (!caseSensitive) { // 判断是否对大小写敏感
        // 如果不区分大小写，就把页面内容全部转为小写
        // 这里pagecont
        pageContent = pageContent.toLowerCase();
        query_words[i] = query_words[i].toLowerCase();
      }
    }
    // let query = this._normalize(this.state.query); // 这里只传只传了一个词，我需要传多个词
    let phraseSearch = this.state.phraseSearch;
    // let queryLen = query.length;
    // 查询内容为空，返回
    if (query_words.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }
    // 不区分大小写的话
    // 以上内容规范了当前页和查询内容的规范化
    // 以下为真正的匹配的内容
    if (phraseSearch) {  // 若为true则可匹配整段文字
      this._calculatePhraseMatch(query_words, pageIndex, pageContent); // 词组匹配功能
    } else { // 若为false则只能匹配单个的词，特征：单个的词一般 两边有空格
      this._calculateWordMatch(query_words, pageIndex, pageContent); // 单词匹配功能
    }
    // 将计算的匹配结果，用来更新匹配结果
    this._updatePage(pageIndex); // 清除以前的匹配结果，渲染最新的匹配结果
    if (this.resumePageIdx === pageIndex) { // 如果恢复的页面索引等于当前索引
      this.resumePageIdx = null; // 将恢复页面的索引设为空
      this._nextPageMatch(); //
    }

    // Update the match count.
    // 更新匹配个数
    // this.pageMatches[pageIndex]里记录了第pageIndex页匹配的关键词的位置
    // 通过下面这样就可以获得匹配的个数了
    // 问题：我在搜索时，没有全部高亮时，就可以显示全部匹配的单词个数
    // 那我为什么不能全部高亮呢
    if (this.pageMatches[pageIndex].length > 0) {
      this.matchCount += this.pageMatches[pageIndex].length;
      this._updateUIResultsCount();
    }
  }

然后切换到——》_calculateWordMatch()函数中做一些循环处理：

_calculateWordMatch(query, pageIndex, pageContent) { // 单词匹配
    let matchesWithLength = [];
    // Divide the query into pieces and search for text in each piece.
    // 把搜索的词分开，然后搜索分开的词，最小单位为字母
    // query.match 清除了搜索值的空格，返回了一个有着被搜索词的数组
    // 例：hello world -->"hello","world"
    let test_query = query;

    for (let x = 0; x < test_query.length; x++) {
      // 只有在这里才可以给段落设置一个sign,后面我才能进行匹配，滚动到该段落位置
      let queryArray = test_query[x].match(/\S+/g); // '/S':任何一个非空白字符
      for (let i = 0, len = queryArray.length; i < len; i++) {
        // 若果是英文单词，就是单个字母的循环
        let subquery = queryArray[i];
        let subqueryLen = subquery.length;
        let matchIdx = -subqueryLen; // 这里设为负数是什么意思
        while (true) {
          // matchInd+subqueryLen  == 0:为开始查找的位置
          // matchIdx 为返回的匹配的的位置
          matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
          if (matchIdx === -1) { // 说明没有匹配到
            break;
          }
          // Other searches do not, so we store the length.
          matchesWithLength.push({ // 将搜索的索引存到 matchesWithLength里
            match: matchIdx,
            matchLength: subqueryLen,
            skipped: false,
          });
        }
      }
    }

    // Prepare arrays for storing the matches.
    if (!this.pageMatchesLength) {
      this.pageMatchesLength = []; // 清空这个要存储的数组
    }
    this.pageMatchesLength[pageIndex] = []; // 某一页的匹配结果，存到每一页上
    this.pageMatches[pageIndex] = [];

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    // this.pageMatches[pageIndex] 存索引
    // this.pageMatchesLength[pageIndex] 存对应的长度
    // 还是和phrase一样，pageMatches用来存索引
    this._prepareMatches(matchesWithLength, this.pageMatches[pageIndex],
      this.pageMatchesLength[pageIndex]);
  }

效果截图展示：

GitHub 加速计划 / pd / pdf.js

47.48 K

9.86 K

下载

PDF Reader in JavaScript

最近提交(Master分支：3 个月前 )

18284815 [Editor] Update the disclaimer string in the new alt-text dialog (bug 1911738) 3 个月前

fc602c65 And tweak the css in order to take into account that disclaimer can be on two (or more lines). 3 个月前

GitCode 开源社区

旨在为数千万中国开发者提供一个无缝且高效的云端环境，以支持学习、使用和贡献开源项目。

更多推荐

[转载]在Windows环境下安装GNU Radio

转自：在Windows环境下安装GNURadio_恐弱智_新浪博客GNU Radio是用Python开发的，大部分开源的工程能够在Linux环境下运行良好，而Windows下却运行的很勉强，而且安装配置都很复杂。GNU Radio算是个例外了，不光提供了Windows的二进制安装，还有比较详细的说明。我是Python小白，所以折腾了好久才弄好，特意记录下来，免得以后再装还折腾。GNU Radio的

GitCode 开源社区

centOS 8 使用dnf安装Docker

DNF是什么？CentOS 8使用YUM软件包管理器版本v4.0.4。现在，该版本使用DNF(已删除YUM)。DNF是软件包管理器。它会在Linux发行版上安装，执行更新并删除软件包。使用DNF安装Docker跳过具有损坏依赖性的程序包一个有效的解决方案是使您的CentOS 8系统使用以下--nobest命令安装最符合条件的版本：sudo dnf install docker...

GitCode 开源社区

定时同步数据库表(mysql+linux+crontab)

sync.sh里面的参数需要改变，ip/username/password/database/tablesync.sh#!/bin/sh# Please change the IP and password of the data source db.# Then change the table name.filename=/home/nington/db/$(date +%Y-%m