Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Data Manipulation with dplyr (First Steps)

OmaymaS
November 08, 2018

Data Manipulation with dplyr (First Steps)

A workshop for beginners on the #tidyverse, focusing on data manipulation using #dplyr along with hands-on exercises.

Delivered at DataFest Tbilisi 2018.

OmaymaS

November 08, 2018
Tweet

More Decks by OmaymaS

Other Decks in Technology

Transcript

  1. INTRO TO THE TIDYVERSE
    DATA MANIPULATION USING
    OMAYMA SAID
    OmaymaS

    View Slide

  2. The Tidyverse
    Source: https://imgur.com/a/l7fNwP1

    View Slide

  3. The Tidyverse
    Source: https://imgur.com/a/l7fNwP1

    View Slide

  4. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4
    > minions
    dataframe/tbl

    View Slide

  5. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4
    VARIABLES
    OBSERVATIONS

    View Slide

  6. kevin <-

    View Slide

  7. kevin <-
    kevin_new <- rotate(kevin,
    direction = "clockwise",
    angle = 90)
    object
    function arguments

    View Slide

  8. kevin_new <- rotate(kevin,
    direction = "clockwise",
    angle = 90)
    object
    function arguments
    What is the value of kevin_new?
    kevin <-

    View Slide

  9. kevin_new
    kevin <-
    kevin_new <- rotate(kevin,
    direction = "clockwise",
    angle = 90)
    object
    function arguments

    View Slide

  10. A grammar of data manipulation

    View Slide

  11. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4
    > minions

    View Slide

  12. select()
    Return a subset of columns

    View Slide

  13. select(minions, id, age)
    dataframe
    Columns
    to select

    View Slide

  14. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4
    id age
    101 5
    102 6
    108 10
    120 16
    100 3
    select(minions, id, age)
    New dataframe/tbl

    View Slide

  15. select(minions, -missions_external)
    dataframe Column to exclude

    View Slide

  16. id minion leader type age missions_
    internal
    101 yellow 5 60
    102 yellow 6 55
    108 purple 10 48
    120 purple 16 49
    100 yellow 3 54
    select(minions, -missions_external)

    View Slide

  17. select(minions, id:leader)
    dataframe
    Range of
    columns to
    select

    View Slide

  18. id minion leader
    101
    102
    108
    120
    100
    select(minions, id:leader)

    View Slide

  19. filter()
    Return a subset of rows

    View Slide

  20. filter(minions, type == "yellow")
    dataframe Condition

    View Slide

  21. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    100 yellow 3 54 4
    filter(minions, type == "yellow")

    View Slide

  22. == equal
    > greater than
    < less than
    >= greater than or equal
    <= less than or equal
    != not equal
    MORE CONDITIONS
    COMBINE WITH
    & AND
    | OR
    ,

    View Slide

  23. filter(minions, type == "yellow"
    , age > 3)
    dataframe Multiple Conditions

    View Slide

  24. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    filter(minions, type == "yellow"
    , age > 3)

    View Slide

  25. mutate()
    add/modify columns

    View Slide

  26. mutate(minions, missions = missions_internal+missions_external)
    dataframe expression
    New
    column
    name

    View Slide

  27. id minion leader type age missions_
    internal
    missions_
    external
    missions
    101 yellow 5 60 2 62
    102 yellow 6 55 10 65
    108 purple 10 48 3 51
    120 purple 16 49 1 50
    100 yellow 3 54 4 58
    mutate(minions, missions = missions_internal+missions_external)

    View Slide

  28. summarize()
    Calculate aggregate measures for groups

    View Slide

  29. summarize(minions, age_median = median(age))
    expression
    New column
    name
    dataframe

    View Slide

  30. summarize(minions, age_median = median(age))
    age_median
    6
    id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4

    View Slide

  31. summarize(minions,
    age_median = median(age),
    missions_internal_all = sum(missions_internal),
    missions_external_all = sum(missions_external))
    Multiple expressions

    View Slide

  32. group_by()
    Group by one or more variables

    View Slide

  33. minions %>%
    group_by(leader) %>%
    summarize(missions_internal_all = sum(missions_internal),
    missions_external_all = sum(missions_external))
    New column name Expression
    dataframe group

    View Slide

  34. minions %>%
    group_by(leader) %>%
    summarize(missions_internal_all = sum(missions_internal),
    missions_external_all = sum(missions_external))
    leader missions_internal_all missions_external_all
    169 16
    97 4

    View Slide

  35. arrange()
    Reorder rows based on variables

    View Slide

  36. arrange(minions, missions_internal)
    dataframe Column name

    View Slide

  37. id minion leader type age missions_
    internal
    missions_
    external
    108 purple 10 48 3
    120 purple 16 49 1
    100 yellow 3 54 4
    102 yellow 6 55 10
    101 yellow 5 60 2
    arrange(minions, missions_internal)
    DEFAULT
    Ascending

    View Slide

  38. id minion leader type age missions_
    internal
    missions_
    external
    101 yellow 5 60 2
    102 yellow 6 55 10
    100 yellow 3 54 4
    120 purple 16 49 1
    108 purple 10 48 3
    arrange(minions, desc(missions_internal))

    View Slide

  39. %>%
    The Pipe

    View Slide

  40. <- %>% rotate("clockwise", 90)
    object function
    <- rotate( , "clockwise", 90)
    arguments
    object
    function arguments
    pipe
    =

    View Slide

  41. <- scale( , 0.25)
    1
    Successive commands

    View Slide

  42. <- scale( , 0.25)
    1
    2 <- rotate( , "clockwise", 90)
    Successive commands

    View Slide

  43. <- scale( , 0.25)
    <- rotate( , "clockwise", 90)
    <- clone( , 1)
    1
    2
    3
    Successive commands

    View Slide

  44. <- scale( , 0.25)
    1
    2 <- rotate( , "clockwise", 90)
    <- clone( , 1)
    3
    Successive commands

    View Slide

  45. <- clone(rotate(scale( , 0.25), "clockwise", 90),1)
    One-line commands

    View Slide

  46. k %>%
    scale(0.25) %>%
    rotate("clockwise", 90) %>%
    clone(1)
    <-
    Piped commands

    View Slide

  47. MISSION ACCOMPLISHED

    View Slide