extract images from a pdf
pdimg_images(paths, base_dir = NULL, ...)
(character) one or more paths to pdf files, required
(character) the base path to collect files into. if NULL
(default), we use a temp directory
additional command line args passed on to pdfimages. See pdimg_help() for docs
a list of data.frames, one per input path, each holding metadata on the images in that pdf. If a path is not found, or the path is found but no images are found, a warning is thrown and a zero row data.frame is returned for that path
By default we use a temp dir to store extracted images - at the end
of an R session these are cleaned up (deleted). To keep images
after the R session ends, use base_dir
# images found
x <- system.file("examples/BachmanEtal2020.pdf", package="pdfimager")
res <- pdimg_images(x)
res
#> [[1]]
#> # A tibble: 3 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 5 0 image 1024 573 rgb 3 8 jpeg yes 178
#> 2 /var/fol… 8 1 image 1024 1001 rgb 3 8 jpeg yes 146
#> 3 /var/fol… 11 2 image 1024 988 rgb 3 8 jpeg yes 110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
res[[1]]$path
#> [1] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-000.ppm"
#> [2] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-001.ppm"
#> [3] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-002.ppm"
file.exists(res[[1]]$path[1])
#> [1] TRUE
z <- system.file("examples/Tierney2017JOSS.pdf", package="pdfimager")
pdimg_images(z)
#> [[1]]
#> # A tibble: 6 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 1 0 image 650 249 rgb 3 8 image no 6
#> 2 /var/fol… 1 1 smask 650 249 gray 1 8 image no 6
#> 3 /var/fol… 2 2 image 650 249 rgb 3 8 image no 6
#> 4 /var/fol… 2 3 smask 650 249 gray 1 8 image no 6
#> 5 /var/fol… 2 4 image 672 480 icc 3 8 image no 41
#> 6 /var/fol… 2 5 smask 672 480 gray 1 8 image no 41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
# change base directory to hold extracted images
x <- system.file("examples/BachmanEtal2020.pdf", package="pdfimager")
dir <- file.path(tempdir(), "bluetree")
res <- pdimg_images(x, base_dir = dir)
# many at once
pdimg_images(c(x, z))
#> [[1]]
#> # A tibble: 3 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 5 0 image 1024 573 rgb 3 8 jpeg yes 178
#> 2 /var/fol… 8 1 image 1024 1001 rgb 3 8 jpeg yes 146
#> 3 /var/fol… 11 2 image 1024 988 rgb 3 8 jpeg yes 110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
#> [[2]]
#> # A tibble: 6 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 1 0 image 650 249 rgb 3 8 image no 6
#> 2 /var/fol… 1 1 smask 650 249 gray 1 8 image no 6
#> 3 /var/fol… 2 2 image 650 249 rgb 3 8 image no 6
#> 4 /var/fol… 2 3 smask 650 249 gray 1 8 image no 6
#> 5 /var/fol… 2 4 image 672 480 icc 3 8 image no 41
#> 6 /var/fol… 2 5 smask 672 480 gray 1 8 image no 41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
## pass custom dir to each path
pdimg_images(c(x, z), file.path(tempdir(), "pepperjackcheese"))
#> [[1]]
#> # A tibble: 3 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 5 0 image 1024 573 rgb 3 8 jpeg yes 178
#> 2 /var/fol… 8 1 image 1024 1001 rgb 3 8 jpeg yes 146
#> 3 /var/fol… 11 2 image 1024 988 rgb 3 8 jpeg yes 110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
#> [[2]]
#> # A tibble: 6 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 1 0 image 650 249 rgb 3 8 image no 6
#> 2 /var/fol… 1 1 smask 650 249 gray 1 8 image no 6
#> 3 /var/fol… 2 2 image 650 249 rgb 3 8 image no 6
#> 4 /var/fol… 2 3 smask 650 249 gray 1 8 image no 6
#> 5 /var/fol… 2 4 image 672 480 icc 3 8 image no 41
#> 6 /var/fol… 2 5 smask 672 480 gray 1 8 image no 41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
# no images found, but there are actually images
d <- system.file("examples/LahtiEtal2017.pdf", package="pdfimager")
pdimg_images(d)
#> Warning: no images found in pdf
#> [[1]]
#> # A tibble: 0 × 0
#>
# no images found, and there really are no images
w <- system.file("examples/White2015.pdf", package="pdfimager")
pdimg_images(w)
#> Warning: no images found in pdf
#> [[1]]
#> # A tibble: 0 × 0
#>
# path not found
pdimg_images("foo-bar")
#> Warning: path 'foo-bar' does not exist
#> [[1]]
#> # A tibble: 0 × 0
#>
# only gets overlaid smaller images on plots, doesn't get plots
# themselves
g <- system.file("examples/vanGemert2018.pdf", package="pdfimager")
pdimg_images(g)
#> [[1]]
#> # A tibble: 8 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fol… 1 0 image 1491 256 cmyk 4 8 jpeg no 352
#> 2 /var/fol… 3 1 image 121 53 sep 1 8 jpeg no 84
#> 3 /var/fol… 3 2 image 114 86 sep 1 8 jpeg no 83
#> 4 /var/fol… 3 3 image 108 39 sep 1 8 jpeg no 82
#> 5 /var/fol… 5 4 image 25 117 sep 1 8 image no 141
#> 6 /var/fol… 5 5 image 17 34 sep 1 8 image no 140
#> 7 /var/fol… 5 6 image 19 91 sep 1 8 image no 139
#> 8 /var/fol… 5 7 image 18 69 sep 1 8 image no 138
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
# number of images doesn't match number of rows of metadata
## so we fix internally by removing duplicate files for same image
h <- system.file("examples/SanyalEtal2018.pdf", package="pdfimager")
pdimg_images(h)
#> [[1]]
#> # A tibble: 52 × 17
#> path page num type width height color comp bpc enc interp object
#> <chr> <int> <int> <chr> <int> <int> <chr> <int> <int> <chr> <chr> <int>
#> 1 /var/fo… 1 0 image 486 221 cmyk 4 8 jpeg no 227
#> 2 /var/fo… 1 1 sten… 2744 974 - 1 1 ccitt no 229
#> 3 /var/fo… 1 2 image 178 148 cmyk 4 8 jpeg no 231
#> 4 /var/fo… 5 3 image 980 946 rgb 3 8 jpeg no 36
#> 5 /var/fo… 6 4 image 82 76 index 1 8 image no 87
#> 6 /var/fo… 6 5 image 82 76 index 1 8 image no 88
#> 7 /var/fo… 6 6 image 82 76 index 1 8 image no 98
#> 8 /var/fo… 6 7 image 82 76 index 1 8 image no 104
#> 9 /var/fo… 6 8 image 82 76 index 1 8 image no 118
#> 10 /var/fo… 6 9 image 82 76 index 1 8 image no 124
#> # ℹ 42 more rows
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> # ratio <chr>
#>
# convert to another format - not entirely sure how this works,
# it's not documented
j <- system.file("examples/SanyalEtal2018.pdf", package="pdfimager")
pdimg_images(j, format = "-png")[[1]]$path[1:5]
#> [1] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-000.png"
#> [2] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-001.pbm"
#> [3] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-002.png"
#> [4] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-003.png"
#> [5] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-004.png"