extract images from a pdf

pdimg_images(paths, base_dir = NULL, ...)

Arguments

paths

(character) one or more paths to pdf files, required

base_dir

(character) the base path to collect files into. If NULL (default), a temp directory is used

...

additional command line args passed on to pdfimages. See pdimg_help() for docs

Value

A list of data.frames (tibbles), one per element of paths, each holding metadata on the images in that pdf. If a path is not found, or the path is found but no images are found, a warning is thrown and a zero-row data.frame is returned for that path

Note

By default we use a temp dir to store extracted images - at the end of an R session these are cleaned up (deleted). To keep images after the R session ends, use base_dir

Examples

# images found
x <- system.file("examples/BachmanEtal2020.pdf", package="pdfimager")
res <- pdimg_images(x)
res
#> [[1]]
#> # A tibble: 3 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     5     0 image  1024    573 rgb       3     8 jpeg  yes       178
#> 2 /var/fol…     8     1 image  1024   1001 rgb       3     8 jpeg  yes       146
#> 3 /var/fol…    11     2 image  1024    988 rgb       3     8 jpeg  yes       110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 
res[[1]]$path
#> [1] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-000.ppm"
#> [2] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-001.ppm"
#> [3] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/BachmanEtal2020/img-002.ppm"
file.exists(res[[1]]$path[1])
#> [1] TRUE

z <- system.file("examples/Tierney2017JOSS.pdf", package="pdfimager")
pdimg_images(z)
#> [[1]]
#> # A tibble: 6 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     1     0 image   650    249 rgb       3     8 image no          6
#> 2 /var/fol…     1     1 smask   650    249 gray      1     8 image no          6
#> 3 /var/fol…     2     2 image   650    249 rgb       3     8 image no          6
#> 4 /var/fol…     2     3 smask   650    249 gray      1     8 image no          6
#> 5 /var/fol…     2     4 image   672    480 icc       3     8 image no         41
#> 6 /var/fol…     2     5 smask   672    480 gray      1     8 image no         41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 

# change base directory to hold extracted images
x <- system.file("examples/BachmanEtal2020.pdf", package="pdfimager")
dir <- file.path(tempdir(), "bluetree")
res <- pdimg_images(x, base_dir = dir)

# many at once
pdimg_images(c(x, z))
#> [[1]]
#> # A tibble: 3 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     5     0 image  1024    573 rgb       3     8 jpeg  yes       178
#> 2 /var/fol…     8     1 image  1024   1001 rgb       3     8 jpeg  yes       146
#> 3 /var/fol…    11     2 image  1024    988 rgb       3     8 jpeg  yes       110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 
#> [[2]]
#> # A tibble: 6 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     1     0 image   650    249 rgb       3     8 image no          6
#> 2 /var/fol…     1     1 smask   650    249 gray      1     8 image no          6
#> 3 /var/fol…     2     2 image   650    249 rgb       3     8 image no          6
#> 4 /var/fol…     2     3 smask   650    249 gray      1     8 image no          6
#> 5 /var/fol…     2     4 image   672    480 icc       3     8 image no         41
#> 6 /var/fol…     2     5 smask   672    480 gray      1     8 image no         41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 
## pass custom dir to each path
pdimg_images(c(x, z), file.path(tempdir(), "pepperjackcheese"))
#> [[1]]
#> # A tibble: 3 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     5     0 image  1024    573 rgb       3     8 jpeg  yes       178
#> 2 /var/fol…     8     1 image  1024   1001 rgb       3     8 jpeg  yes       146
#> 3 /var/fol…    11     2 image  1024    988 rgb       3     8 jpeg  yes       110
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 
#> [[2]]
#> # A tibble: 6 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     1     0 image   650    249 rgb       3     8 image no          6
#> 2 /var/fol…     1     1 smask   650    249 gray      1     8 image no          6
#> 3 /var/fol…     2     2 image   650    249 rgb       3     8 image no          6
#> 4 /var/fol…     2     3 smask   650    249 gray      1     8 image no          6
#> 5 /var/fol…     2     4 image   672    480 icc       3     8 image no         41
#> 6 /var/fol…     2     5 smask   672    480 gray      1     8 image no         41
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 

# no images found, even though the pdf actually contains images
d <- system.file("examples/LahtiEtal2017.pdf", package="pdfimager")
pdimg_images(d)
#> Warning: no images found in pdf
#> [[1]]
#> # A tibble: 0 × 0
#> 

# no images found, and there really are no images
w <- system.file("examples/White2015.pdf", package="pdfimager")
pdimg_images(w)
#> Warning: no images found in pdf
#> [[1]]
#> # A tibble: 0 × 0
#> 

# path not found
pdimg_images("foo-bar")
#> Warning: path 'foo-bar' does not exist
#> [[1]]
#> # A tibble: 0 × 0
#> 

# only gets overlaid smaller images on plots, doesn't get plots
# themselves
g <- system.file("examples/vanGemert2018.pdf", package="pdfimager")
pdimg_images(g)
#> [[1]]
#> # A tibble: 8 × 17
#>   path       page   num type  width height color  comp   bpc enc   interp object
#>   <chr>     <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#> 1 /var/fol…     1     0 image  1491    256 cmyk      4     8 jpeg  no        352
#> 2 /var/fol…     3     1 image   121     53 sep       1     8 jpeg  no         84
#> 3 /var/fol…     3     2 image   114     86 sep       1     8 jpeg  no         83
#> 4 /var/fol…     3     3 image   108     39 sep       1     8 jpeg  no         82
#> 5 /var/fol…     5     4 image    25    117 sep       1     8 image no        141
#> 6 /var/fol…     5     5 image    17     34 sep       1     8 image no        140
#> 7 /var/fol…     5     6 image    19     91 sep       1     8 image no        139
#> 8 /var/fol…     5     7 image    18     69 sep       1     8 image no        138
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 

# number of images doesn't match number of rows of metadata
## so we fix internally by removing duplicate files for same image
h <- system.file("examples/SanyalEtal2018.pdf", package="pdfimager")
pdimg_images(h)
#> [[1]]
#> # A tibble: 52 × 17
#>    path      page   num type  width height color  comp   bpc enc   interp object
#>    <chr>    <int> <int> <chr> <int>  <int> <chr> <int> <int> <chr> <chr>   <int>
#>  1 /var/fo…     1     0 image   486    221 cmyk      4     8 jpeg  no        227
#>  2 /var/fo…     1     1 sten…  2744    974 -         1     1 ccitt no        229
#>  3 /var/fo…     1     2 image   178    148 cmyk      4     8 jpeg  no        231
#>  4 /var/fo…     5     3 image   980    946 rgb       3     8 jpeg  no         36
#>  5 /var/fo…     6     4 image    82     76 index     1     8 image no         87
#>  6 /var/fo…     6     5 image    82     76 index     1     8 image no         88
#>  7 /var/fo…     6     6 image    82     76 index     1     8 image no         98
#>  8 /var/fo…     6     7 image    82     76 index     1     8 image no        104
#>  9 /var/fo…     6     8 image    82     76 index     1     8 image no        118
#> 10 /var/fo…     6     9 image    82     76 index     1     8 image no        124
#> # ℹ 42 more rows
#> # ℹ 5 more variables: ID <int>, x.ppi <int>, y.ppi <int>, size <chr>,
#> #   ratio <chr>
#> 

# convert to another format - not entirely sure how this works,
# it's not documented
j <- system.file("examples/SanyalEtal2018.pdf", package="pdfimager")
pdimg_images(j, format = "-png")[[1]]$path[1:5]
#> [1] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-000.png"
#> [2] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-001.pbm"
#> [3] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-002.png"
#> [4] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-003.png"
#> [5] "/var/folders/py/lcjn3y352g1106vf1rqk521r0000gn/T//RtmpQVrOR8/SanyalEtal2018/img-004.png"